import gradio as gr import pandas as pd import logging from scrapper.main import ArxivPaper from config import * from db.db_functions import get_correct_author_name, insert_papers_data, fetch_papers_data, get_unquine_authors from utils import compare_paper_ids import chromadb from chromadb.config import Settings from chromadb.utils import embedding_functions from config import OPENAI_API_KEY emmbedding_model = "text-embedding-3-large" openai_ef = embedding_functions.OpenAIEmbeddingFunction(model_name=emmbedding_model,api_key=OPENAI_API_KEY) if deploy: chroma_client = chromadb.HttpClient(host='localhost', port=8000) else: chroma_client = chromadb.PersistentClient(path="/home/ubuntu/research/data/emeddeings") collection_doc = chroma_client.get_or_create_collection(name="2024_document_lvl_test") def plagiarism_checker(authors_name_fetch,number_of_results_fetch, progress=gr.Progress()): number_of_results_fetch = int(number_of_results_fetch) print(authors_name_fetch,number_of_results_fetch,type(number_of_results_fetch)) progress(0.2, desc="Collecting Links") author_obj = ArxivPaper(authors_name_fetch) db_author_name = get_correct_author_name(authors_name_fetch) paper_links = author_obj.get_results_google(number_of_results=number_of_results_fetch) paper_ids = author_obj.get_paper_id(paper_links) progress(0.4, desc=f"Collecting Papers for {len(paper_ids)}") if db_author_name is None: data = fetch_papers_data(db_author_name) remaining_paper_ids = compare_paper_ids(data,paper_ids) print("No similar author found in the database") local_saved_papers = os.path.join(os.getcwd(), "data", "papers", authors_name_fetch.replace(" ", "_")) progress(0.6, desc="Making summary") data_to_save = [] if remaining_paper_ids != []: author_obj.get_paper_details_batch(paper_ids=paper_ids, path="./data/papers") for paper in os.listdir(local_saved_papers): paper_path = os.path.join(local_saved_papers, paper) with open(paper_path, "r") as f: data_to_save.append(f.read()) else: print(f"Found similar author in the database: {db_author_name}") data = fetch_papers_data(db_author_name) remaining_paper_ids = compare_paper_ids(data,paper_ids) local_saved_papers = os.path.join(os.getcwd(), "data", "papers", authors_name_fetch.replace(" ", "_")) progress(0.6, desc="Making summary") data_to_save = [] if remaining_paper_ids != []: author_obj.get_paper_details_batch(paper_ids=remaining_paper_ids, path="./data/papers") for paper in os.listdir(local_saved_papers): paper_path = os.path.join(local_saved_papers, paper) with open(paper_path, "r") as f: data_to_save.append(f.read()) else: print("All papers already present in the database") progress(0.8, desc="Saving to Database") insert_papers_data(data_to_save, authors_name_fetch) return f"Fetched Latest Papers for {len(remaining_paper_ids)}" def fetch_papers_data_df(authors_name: str, progress=gr.Progress()): progress(0.2, desc="Fetching Papers") fetched_data = fetch_papers_data(authors_name,fields_to_query=['doi_no', 'author_name', 'title', 'authors', 'year', 'pdf_link', 'references', 'categories', 'comment', 'journal_ref', 'source', 'summary', 'published']) progress(0.8, desc="Making DataFrame") return pd.DataFrame(fetched_data) def embedding_searcher(embbed_text_search, top_k=4, progress=gr.Progress()): data = collection_doc.query(query_embeddings = openai_ef([embbed_text_search]), n_results=top_k) result = pd.DataFrame(data['ids'][0], columns=['ID']) result['Distance'] = data['distances'][0] # Extracting information from metadatas metadata_list = data['metadatas'][0] titles = [metadata['title'] for metadata in metadata_list] authors = [metadata['authors'] for metadata in metadata_list] sources = [metadata['source'] for metadata in metadata_list] # Adding metadata columns to the dataframe result['Title'] = titles result['Authors'] = authors result['Source'] = sources return result with gr.Blocks() as demo: with gr.Tab("Get Papers Data"): with gr.Row(): authors_name_paper = gr.Textbox(label="Enter Author's Name") submit_button_tab_2 = gr.Button("Start") with gr.Row(): dataframe_output = gr.Dataframe(headers=['doi_no', 'author_name', 'title', 'authors', 'year', 'pdf_link', 'references', 'categories', 'comment', 'journal_ref', 'source', 'summary', 'published']) with gr.Row(): unquine_authors_output = gr.Dataframe(headers=["author_name"],value=get_unquine_authors(), label=" Authors Currently in our DB") with gr.Tab("Arxiv Plagiarism Fetcher & Save to DB"): with gr.Row(): authors_name_fetch = gr.Textbox(label="Enter Author's Name") number_of_results_fetch = gr.Textbox(label="Number of results - Min - 5") submit_button_tab_1 = gr.Button("Start") with gr.Row(): completed = gr.Textbox(label="Completed") with gr.Tab("Arxiv Plagiarism Checker"): with gr.Row(): authors_name = gr.Textbox(label="Enter Author's Name") number_of_results = gr.Number(label="Number of results - Min - 5") submit_button = gr.Button("Start") with gr.Tab("Open Embeddings Search"): with gr.Row(): embbed_text_search = gr.Textbox(label="Enter Text") with gr.Row(): top_k = gr.Number(label="Number of results - Min 2") with gr.Row(): submit_button_tab_4 = gr.Button("Start") dataframe_output_tab_4 = gr.Dataframe(headers=['ID', 'Distance', 'Title', 'Authors', 'Source']) submit_button_tab_1.click(fn=plagiarism_checker,inputs=[authors_name_fetch, number_of_results_fetch] ,outputs= completed) submit_button_tab_2.click(fn=fetch_papers_data_df,inputs=[authors_name_paper] ,outputs=dataframe_output) submit_button_tab_4.click(fn=embedding_searcher,inputs=[embbed_text_search, top_k] ,outputs= dataframe_output_tab_4) demo.launch()