# Spaces:
# Runtime error
# Runtime error
import logging
import os

import arxiv
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI

# NOTE: local imports keep their original relative order because the wildcard
# import from `config` may shadow names imported before it.
from plagiarism.preprocessing import get_pdf_info
from config import OPENAI_API_KEY
from scrapper.extractor import get_google_scrape
from utils import extract_json_from_text, check_id_extis_in_json, generate_uuid, text_splitter
from db.vector_fucntions import add_document_chroma_collection
from prompts.templates import prompt_unquiness_para, google_search_titles
from scrapper.main import ArxivPaper
from config import *
from db.db_functions import get_correct_author_name
from db.vector_fucntions import collection_doc, collection_para, openai_ef
""" | |
author_check = ArxivPaperAuthorPlagiarismCheck("Abney, Steven F", 3) | |
author_check.process_papers() | |
similar_paper_ids = author_check.find_similar_papers() | |
author_check.add_embeddings_to_db(similar_paper_ids) | |
""" | |
class ArxivPaperAuthorPlagiarismCheck:
    """Gather an author's arXiv papers and index look-alike papers.

    Intended call order: construct the object, call ``process_papers()``,
    call ``find_similar_papers()``, then pass the returned ids to
    ``add_embeddings_to_db()``.

    Attributes set by ``__init__``:
        author_name: author name exactly as passed in.
        num_results: number of Google results requested for the author.
        author_obj: ``ArxivPaper`` helper built from ``author_name``.
        db_author_name: value returned by ``get_correct_author_name``.
        paper_links: result of ``author_obj.get_results_google``.
        paper_ids: result of ``author_obj.get_paper_id(paper_links)``.
        data_papers: result of ``author_obj.get_paper_details_batch``;
            the methods below read the ``'id'``, ``'title'`` and
            ``'summary'`` keys of each entry.
    """

    # Root folder for downloaded PDFs; the author's own papers land directly
    # in it, similar papers go into a per-call sub-folder.
    _TEMP_DIR = "./data_temp/"
    # Only the first few LLM-suggested queries per paper are sent to Google.
    _MAX_QUERIES_PER_PAPER = 3

    _log = logging.getLogger(__name__)

    def __init__(self, author_name, num_results):
        """Resolve the author and eagerly load the metadata of their papers.

        Args:
            author_name: name of the author to look up.
            num_results: forwarded as ``number_of_results`` to
                ``ArxivPaper.get_results_google``.

        NOTE(review): the helpers called here presumably hit the network
        (Google / arXiv) - confirm before constructing in latency-sensitive
        code.
        """
        self.author_name = author_name
        self.num_results = num_results
        self.author_obj = ArxivPaper(self.author_name)
        self.db_author_name = get_correct_author_name(self.author_name)
        self.paper_links = self.author_obj.get_results_google(number_of_results=self.num_results)
        self.paper_ids = self.author_obj.get_paper_id(self.paper_links)
        self.data_papers = self.author_obj.get_paper_details_batch(self.paper_ids)

    @staticmethod
    def _build_chain(template):
        """Return a ``prompt | gpt-4 | str`` chain for the given template."""
        # The invoke() dict is piped straight into the prompt. Wrapping it in
        # {"title": RunnablePassthrough(), ...} hands the WHOLE input dict to
        # every placeholder, so each variable would render as the full payload
        # (and the paper text would be sent to the model several times over).
        return (
            ChatPromptTemplate.from_template(template)
            | ChatOpenAI(model="gpt-4", api_key=OPENAI_API_KEY)
            | StrOutputParser()
        )

    @classmethod
    def _download_pdf(cls, client, paper_id, dirpath):
        """Download the PDF of ``paper_id`` into ``dirpath``.

        Returns:
            The path of the saved file, or ``None`` when arXiv returned no
            entry for ``paper_id``.
        """
        os.makedirs(dirpath, exist_ok=True)
        paper = next(client.results(arxiv.Search(id_list=[paper_id])), None)
        if paper is None:
            cls._log.warning("arXiv returned no entry for id %r; skipping", paper_id)
            return None
        # Old-style ids such as "hep-th/9901001" contain a slash, which must
        # not end up in the file name.
        file_name = paper_id.replace("/", "_") + ".pdf"
        paper.download_pdf(dirpath=dirpath, filename=file_name)
        return os.path.join(dirpath, file_name)

    @staticmethod
    def _arxiv_id_from_link(link):
        """Extract the arXiv identifier from an arxiv.org URL.

        Handles ``/abs/<id>`` and ``/pdf/<id>[.pdf]`` links (including
        old-style ``archive/NNNNNNN`` ids), query strings, fragments and
        trailing slashes. For any other URL shape the last path segment is
        returned; the result is ``""`` when there is none.
        """
        path = link.split("?", 1)[0].split("#", 1)[0]
        for marker in ("/abs/", "/pdf/"):
            _, found, tail = path.partition(marker)
            if found:
                break
        else:
            tail = path.rsplit("/", 1)[-1]
        tail = tail.strip("/")
        if tail.endswith(".pdf"):
            tail = tail[:-len(".pdf")]
        return tail

    def process_papers(self):
        """Ask the LLM for the unique paragraphs of each of the author's papers.

        For every entry of ``self.data_papers`` the PDF is downloaded into
        ``./data_temp/``, parsed with ``get_pdf_info``, and the parsed
        paragraphs are sent to the model together with title and summary.
        The JSON extracted from the answer is stored in the entry under
        ``'unique_paragraphs'`` (``None`` when the PDF could not be fetched).
        """
        if not self.data_papers:
            return
        chain_unquine = self._build_chain(prompt_unquiness_para)
        # One client for the whole batch so its request pacing applies.
        client = arxiv.Client()
        for single_paper in self.data_papers:
            pdf_path = self._download_pdf(client, single_paper['id'], self._TEMP_DIR)
            if pdf_path is None:
                single_paper['unique_paragraphs'] = None
                continue
            _, paragraphs = get_pdf_info(pdf_path)
            response = chain_unquine.invoke({
                "title": single_paper['title'],
                "summary": single_paper['summary'],
                "text": str(paragraphs),
            })
            single_paper['unique_paragraphs'] = extract_json_from_text(response)

    def find_similar_papers(self):
        """Search Google for arXiv papers related to the author's papers.

        For each paper the LLM proposes search queries; the first
        ``_MAX_QUERIES_PER_PAPER`` of them are run through
        ``get_google_scrape`` and every organic result whose link contains
        ``"arxiv.org"`` contributes one paper id.

        Returns:
            list[str]: ids collected across ALL papers, de-duplicated, in
            first-seen order. Empty when nothing was found.
        """
        if not self.data_papers:
            return []
        chain = self._build_chain(google_search_titles)
        similar_paper_ids = []
        seen = set()
        for single_paper in self.data_papers:
            response = chain.invoke({
                "title": single_paper['title'],
                "summary": single_paper['summary'],
            })
            search_list = extract_json_from_text(response)
            if not isinstance(search_list, list) or not search_list:
                self._log.warning(
                    "no usable search queries for paper %r; skipping",
                    single_paper['id'],
                )
                continue
            for search in search_list[:self._MAX_QUERIES_PER_PAPER]:
                result_dict = get_google_scrape(search) or {}
                for hit in result_dict.get('organic_results', []):
                    link = hit.get('link', '')
                    if "arxiv.org" not in link:
                        continue
                    paper_id = self._arxiv_id_from_link(link)
                    if paper_id and paper_id not in seen:
                        seen.add(paper_id)
                        similar_paper_ids.append(paper_id)
        return similar_paper_ids

    def add_embeddings_to_db(self, similar_paper_ids):
        """Embed the given papers and add them to the Chroma collections.

        Papers for which ``check_id_extis_in_json`` is truthy are skipped.
        Every other paper is downloaded into a fresh
        ``./data_temp/<uuid>`` folder, split into text chunks and paragraphs,
        embedded with ``openai_ef`` and added to ``collection_doc`` /
        ``collection_para``. The metadata entry itself is extended with
        ``'text'`` and ``'paragraphs'``.

        Args:
            similar_paper_ids: arXiv ids, e.g. from ``find_similar_papers``.
                Nothing happens when it is empty.
        """
        if not similar_paper_ids:
            return
        meta_data_similar_papers = self.author_obj.get_paper_details_batch(similar_paper_ids)
        to_save_path = os.path.join(self._TEMP_DIR, str(generate_uuid()))
        os.makedirs(to_save_path, exist_ok=True)
        # One client for the whole batch so its request pacing applies.
        client = arxiv.Client()
        for single_paper in meta_data_similar_papers:
            if check_id_extis_in_json(single_paper['id']):
                continue
            pdf_path = self._download_pdf(client, single_paper['id'], to_save_path)
            if pdf_path is None:
                continue
            texts_data, paragraphs = get_pdf_info(pdf_path)
            texts_list = text_splitter.split_text(texts_data)
            single_paper['text'] = texts_list
            single_paper['paragraphs'] = paragraphs
            metadata = {
                "title": single_paper['title'],
                "summary": single_paper['summary'],
                "authors": " ".join(single_paper['authors']),
                "categories": " ".join(single_paper['categories']),
                "id": single_paper['id'],
            }
            if texts_list:
                doc_embed = openai_ef(texts_list)
                add_document_chroma_collection(collection_doc, texts_list, doc_embed, metadata)
            if paragraphs:
                paragraphs_embed = openai_ef(paragraphs)
                add_document_chroma_collection(collection_para, paragraphs, paragraphs_embed, metadata)