"""Gradio app that checks an uploaded PDF against a standard MoP template.

The uploaded file is copied into the data folder next to the template PDF,
every PDF in that folder is indexed into a FAISS vector store, and a hosted
LLM is asked whether the uploaded document covers the template's Pre-Check
and Post-Check procedures.
"""

import glob
import os
import re
import shutil

import gradio as gr
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFaceHub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS

# PDF that must always stay in the data folder; everything else is treated
# as an uploaded document and removed after it has been indexed.
TEMPLATE_FILENAME = 'Dummy_standard MoP_template_new.pdf'
DATA_FOLDER = "data"
EMBEDDING_MODEL = "thenlper/gte-base"
LLM_REPO_ID = "mistralai/Mixtral-8x7B-Instruct-v0.1"

# The chain output may echo the prompt; keep only what follows this marker.
_ANSWER_PATTERN = re.compile(r"Helpful Answer:\n\n(.*)", re.DOTALL)


def delete_files_except(filename, folder_path):
    """Delete every regular file in *folder_path* except *filename*.

    Args:
        filename: Base name of the file to keep.
        folder_path: Directory whose direct children are examined.

    Returns:
        The base name of the last file that was deleted, or ``None`` when
        nothing was deleted.
    """
    last_deleted = None
    for entry in glob.glob(os.path.join(folder_path, '*')):
        name = os.path.basename(entry)
        if name == filename:
            continue
        # os.remove() cannot delete directories; leave them untouched
        # instead of aborting the whole cleanup.
        if not os.path.isfile(entry):
            continue
        os.remove(entry)
        last_deleted = name
    return last_deleted


def _build_prompt(content_name):
    """Return the instruction prompt comparing *content_name* to the template.

    The text is assembled from adjacent literals purely for readability; the
    resulting string is a single paragraph with one embedded newline.
    """
    template = f'"{TEMPLATE_FILENAME}"'
    content = f'"{content_name}"'
    return (
        f" [INST] You have two documents: Template Document: {template} "
        f"Content Document: {content} "
        "Your task is to determine whether the Content Document effectively "
        "incorporates the context and instructions of the Pre-Check Procedures "
        "and Post-Check Procedures specified in the Template Document, without "
        "the requirement for them to be under specified labels. The focus "
        "should be on the presence of the context and instructions rather than "
        "their exact placement. \n"
        'Provide a "Yes" or "No" response indicating whether the Content '
        "Document accurately integrates the context and instructions of the "
        "Pre-Check Procedures and Post-Check Procedures as outlined in the "
        "Template Document. Additionally, identify any missing elements "
        "related to the context and instructions of the Pre-Check Procedures "
        "and Post-Check Procedures if present, regardless of their placement "
        "within the Content Document. Instructions: Review the context and "
        "instructions of the Pre-Check Procedures and Post-Check Procedures "
        f"detailed in the Template Document ({template}). Assess whether the "
        f"Content Document ({content}) includes the necessary context and "
        "instructions for the Pre-Check Procedures and Post-Check Procedures, "
        "regardless of their specific placement or labeling. "
        'Provide a "Yes" if the Content Document adequately integrates the '
        "context and instructions of the Pre-Check Procedures and Post-Check "
        'Procedures, or "No" if there are significant gaps or omissions. '
        'If the answer is "No," specify any missing elements related to the '
        "context and instructions of the Pre-Check Procedures and Post-Check "
        "Procedures, emphasizing their importance in the Content Document. "
        "Ensure careful consideration of the context and instructions provided "
        f"in the Template Document ({template}) while evaluating the alignment "
        f"of the Content Document ({content}). [/INST] "
    )


def MOP(path):
    """Compare the uploaded PDF in *path* against the template PDF.

    All PDFs in *path* are loaded, chunked and embedded into a FAISS index.
    Every file except the template is then deleted from *path*, and the LLM
    is queried through a "stuff" RetrievalQA chain that retrieves all chunks.

    Args:
        path: Directory containing the template PDF and the uploaded PDF.

    Returns:
        The text following "Helpful Answer:" in the chain output when that
        marker is present, otherwise the raw chain output.

    Raises:
        ValueError: If no text chunks could be produced from the PDFs.
    """
    try:
        documents = PyPDFDirectoryLoader(path).load()
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000, chunk_overlap=500
        )
        text_chunks = splitter.split_documents(documents)
        if not text_chunks:
            # Fail with a clear message here rather than inside the
            # vector-store build, which has nothing to index.
            raise ValueError(f"No readable PDF text found in {path!r}")
        embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
        vector_store = FAISS.from_documents(text_chunks, embedding=embeddings)
    finally:
        # Always clear uploads, even when indexing fails, so a bad file is
        # not picked up again by the directory loader on the next request.
        prompt_file = delete_files_except(TEMPLATE_FILENAME, path)

    llm = HuggingFaceHub(
        repo_id=LLM_REPO_ID,
        model_kwargs={"temperature": 0.1, "max_new_tokens": 2048},
    )
    # k == number of chunks: the chain is given every chunk of both documents.
    retriever = vector_store.as_retriever(
        search_type="similarity",
        search_kwargs={"k": len(text_chunks)},
    )
    qa = RetrievalQA.from_chain_type(
        llm=llm, chain_type="stuff", retriever=retriever, verbose=True
    )

    result = qa.run(_build_prompt(prompt_file))
    match = _ANSWER_PATTERN.search(result)
    if match:
        return match.group(1)
    return result


def process_file(fileobj):
    """Gradio callback: stage the uploaded file and run the comparison.

    Args:
        fileobj: The value Gradio passes for the "file" input. Either an
            object exposing the temp-file path as ``.name`` or a plain path.

    Returns:
        The answer text produced by :func:`MOP`, or a short message when no
        file was supplied.
    """
    if fileobj is None:
        return "Please upload a PDF file."
    # Resolve the path once: os.path.basename() needs a path, not the
    # wrapper object itself.
    source_path = getattr(fileobj, "name", fileobj)
    destination_path = os.path.join(DATA_FOLDER, os.path.basename(source_path))
    shutil.copyfile(source_path, destination_path)
    return MOP(DATA_FOLDER)


demo = gr.Interface(
    fn=process_file,
    inputs=[
        "file",
    ],
    outputs="text",
)

# Only start the web server when run as a script, so the module can be
# imported (e.g. to reuse MOP) without launching the UI.
if __name__ == "__main__":
    demo.launch()