gamingflexer committed on
Commit
8d572ee
·
1 Parent(s): 1ade467

Add plagiarism checker functionality

Browse files
Files changed (1) hide show
  1. src/plagiarism/checker.py +122 -0
src/plagiarism/checker.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import arxiv
3
+ from plagiarism.preprocessing import get_pdf_info
4
+ from langchain_core.runnables import RunnablePassthrough
5
+ from langchain_openai import ChatOpenAI
6
+ from langchain_core.prompts import ChatPromptTemplate
7
+ from langchain_core.output_parsers import StrOutputParser
8
+ from config import OPENAI_API_KEY
9
+ from scrapper.extractor import get_google_scrape
10
+ from utils import extract_json_from_text, check_id_extis_in_json, generate_uuid,text_splitter
11
+ from db.vector_fucntions import add_document_chroma_collection
12
+ from prompts.templates import prompt_unquiness_para, google_search_titles
13
+ from scrapper.main import ArxivPaper
14
+ from config import *
15
+ from db.db_functions import get_correct_author_name
16
+ from db.vector_fucntions import collection_doc, collection_para, openai_ef
17
+
18
+
19
+ """
20
+ author_check = ArxivPaperAuthorPlagiarismCheck("Abney, Steven F", 3)
21
+ author_check.process_papers()
22
+ similar_paper_ids = author_check.find_similar_papers()
23
+ author_check.add_embeddings_to_db(similar_paper_ids)
24
+ """
25
+
26
class ArxivPaperAuthorPlagiarismCheck:
    """Collect an author's arXiv papers and index similar papers for comparison.

    Typical usage::

        author_check = ArxivPaperAuthorPlagiarismCheck("Abney, Steven F", 3)
        author_check.process_papers()
        similar_paper_ids = author_check.find_similar_papers()
        author_check.add_embeddings_to_db(similar_paper_ids)
    """

    # Directory the author's own PDFs are downloaded into by ``process_papers``.
    _TEMP_DIR = "./data_temp/"
    # Maximum number of LLM-generated search queries sent to Google per paper.
    _MAX_SEARCHES_PER_PAPER = 3
    # Chat model used by both LLM chains.
    _MODEL_NAME = "gpt-4"

    def __init__(self, author_name, num_results):
        """Look up the author's papers and fetch their details.

        Args:
            author_name: Author name handed to ``ArxivPaper`` and to
                ``get_correct_author_name``.
            num_results: Number of Google results requested when searching
                for the author's papers.
        """
        self.author_name = author_name
        self.num_results = num_results
        self.author_obj = ArxivPaper(self.author_name)
        self.db_author_name = get_correct_author_name(self.author_name)
        self.paper_links = self.author_obj.get_results_google(number_of_results=self.num_results)
        self.paper_ids = self.author_obj.get_paper_id(self.paper_links)
        # Iterable of per-paper dicts; this class reads the 'id', 'title' and
        # 'summary' keys from each entry.
        self.data_papers = self.author_obj.get_paper_details_batch(self.paper_ids)

    @staticmethod
    def _paper_id_from_link(link):
        """Return the last path segment of *link* as a paper id ("" if none).

        Query strings, fragments and a trailing ``.pdf`` are removed so that
        ``/abs/<id>`` and ``/pdf/<id>.pdf`` links produce the same id.
        """
        path = link.split("#", 1)[0].split("?", 1)[0]
        paper_id = path.split("/")[-1]
        if paper_id.lower().endswith(".pdf"):
            paper_id = paper_id[:-len(".pdf")]
        return paper_id

    @staticmethod
    def _download_pdf(client, paper_id, directory):
        """Download the PDF of *paper_id* into *directory* and return its path."""
        paper = next(client.results(arxiv.Search(id_list=[paper_id])))
        file_name = paper_id + ".pdf"
        paper.download_pdf(dirpath=directory, filename=file_name)
        return os.path.join(directory, file_name)

    def process_papers(self):
        """Store the LLM's answer about unique paragraphs on each author paper.

        For every entry in ``self.data_papers`` the PDF is downloaded and
        parsed with ``get_pdf_info``; the title, summary and paragraphs are
        sent through the ``prompt_unquiness_para`` prompt, and the JSON
        extracted from the reply is saved under ``'unique_paragraphs'``.
        """
        if not self.data_papers:
            # Nothing to do: avoid building an LLM client or touching disk.
            return

        prompt_unquiness = ChatPromptTemplate.from_template(prompt_unquiness_para)
        model = ChatOpenAI(model=self._MODEL_NAME, api_key=OPENAI_API_KEY)
        # The input dict is piped straight into the prompt. Wrapping it in a
        # mapping of RunnablePassthrough() would hand the *whole* dict to each
        # template variable instead of the individual field.
        chain_unquine = prompt_unquiness | model | StrOutputParser()

        # The download fails if the target directory is missing.
        os.makedirs(self._TEMP_DIR, exist_ok=True)
        client = arxiv.Client()

        for single_paper in self.data_papers:
            pdf_path = self._download_pdf(client, single_paper['id'], self._TEMP_DIR)
            _, paragraphs = get_pdf_info(pdf_path)
            response = chain_unquine.invoke({
                "title": single_paper['title'],
                "summary": single_paper['summary'],
                "text": str(paragraphs),
            })
            single_paper['unique_paragraphs'] = extract_json_from_text(response)

    def find_similar_papers(self):
        """Return arXiv ids of papers that Google finds for LLM-made queries.

        For every paper in ``self.data_papers`` the LLM is asked (via the
        ``google_search_titles`` prompt) for search queries; the first few are
        run through ``get_google_scrape`` and every organic result whose link
        contains ``arxiv.org`` is kept.

        Returns:
            List of paper ids gathered across *all* author papers, without
            duplicates and in first-seen order. Empty when nothing was found.
        """
        if not self.data_papers:
            return []

        prompt_relavent_title = ChatPromptTemplate.from_template(google_search_titles)
        model = ChatOpenAI(model=self._MODEL_NAME, api_key=OPENAI_API_KEY)
        # See process_papers: the dict input feeds the prompt directly.
        chain = prompt_relavent_title | model | StrOutputParser()

        # Accumulated across every paper; resetting this inside the loop would
        # throw away the links found for all but one paper.
        paper_links_similar = []
        for single_paper in self.data_papers:
            response = chain.invoke({
                "title": single_paper['title'],
                "summary": single_paper['summary'],
            })
            search_list = extract_json_from_text(response)
            if not search_list:
                # No usable JSON in the reply: skip this paper.
                continue
            for search in search_list[:self._MAX_SEARCHES_PER_PAPER]:
                result_dict = get_google_scrape(search)
                for result in result_dict['organic_results']:
                    if "arxiv.org" in result['link']:
                        paper_links_similar.append(result['link'])

        candidate_ids = (self._paper_id_from_link(link) for link in paper_links_similar)
        # dict.fromkeys de-duplicates while preserving order; empty ids come
        # from links with no trailing path segment and are dropped.
        return list(dict.fromkeys(pid for pid in candidate_ids if pid))

    def add_embeddings_to_db(self, similar_paper_ids):
        """Download, split and store the given papers in the Chroma collections.

        Args:
            similar_paper_ids: Paper ids, e.g. the result of
                ``find_similar_papers``. An empty value is a no-op.

        Papers for which ``check_id_extis_in_json`` is truthy are skipped
        (presumably already indexed -- confirm against that helper). For the
        rest, text chunks go to ``collection_doc`` and paragraphs to
        ``collection_para``, each with the paper's metadata.
        """
        if not similar_paper_ids:
            return

        meta_data_similar_papers = self.author_obj.get_paper_details_batch(similar_paper_ids)

        # Each call downloads into its own freshly named sub-directory.
        to_save_path = os.path.join(f"./data_temp/{str(generate_uuid())}/")
        os.makedirs(to_save_path, exist_ok=True)
        client = arxiv.Client()

        for single_paper in meta_data_similar_papers:
            if check_id_extis_in_json(single_paper['id']):
                # Skipping here keeps the embedding step below from running
                # with text that belongs to a previously processed paper.
                continue

            pdf_path = self._download_pdf(client, single_paper['id'], to_save_path)
            texts_data, paragraphs = get_pdf_info(pdf_path)
            texts_list = text_splitter.split_text(texts_data)
            single_paper['text'] = texts_list
            single_paper['paragraphs'] = paragraphs

            metadata = {
                "title": single_paper['title'],
                "summary": single_paper['summary'],
                "authors": " ".join(single_paper['authors']),
                "categories": " ".join(single_paper['categories']),
                "id": single_paper['id'],
            }

            if texts_list:
                doc_embed = openai_ef(texts_list)
                add_document_chroma_collection(collection_doc, texts_list, doc_embed, metadata)
            if paragraphs:
                paragraphs_embed = openai_ef(paragraphs)
                add_document_chroma_collection(collection_para, paragraphs, paragraphs_embed, metadata)