# Source: Hugging Face Space "ShivanshMathur007/MoP" (status: Sleeping; file size: 5,606 bytes)

from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFaceHub
import os
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import gradio as gr
import re
import shutil
import glob
def delete_files_except(folder_path):
  """Delete one regular file from ``folder_path`` and return its base name.

  The first regular file reported by ``glob`` for ``folder_path/*`` is
  removed. Sub-directories are skipped because ``os.remove`` cannot delete
  them.

  Args:
    folder_path: Directory to look in.

  Returns:
    The base name of the file that was removed, or ``None`` when the folder
    contains no regular file (previously this case raised ``IndexError``).
  """
  for candidate in glob.glob(os.path.join(folder_path, '*')):
    if os.path.isfile(candidate):
      # glob already yields the full path, so there is no need to rebuild it.
      os.remove(candidate)
      return os.path.basename(candidate)
  return None


# Matches the answer text after a "Helpful Answer:" marker, with or without a
# single space before the blank line, up to the end of the model output.
_HELPFUL_ANSWER_RE = re.compile(r'Helpful Answer: ?\n\n(.*)$', re.DOTALL)

# Appended after every extracted answer in the combined report.
_ANSWER_SEPARATOR = "\n\n ************************************************************"


def _build_check_prompt(prompt_file, section):
  """Return the instruction prompt comparing ``prompt_file`` to the template for ``section``."""
  return f"""<s> [INST] You have two documents:

Template Document: "Standard_MoP_template.pdf"
Content Document: "{prompt_file}"
Evaluate whether the Content Document ("{prompt_file}") effectively integrates most of the necessary context and instructions for the "{section}" listed in the Template Document "Standard_MoP_template.pdf".

Instructions:

Review the "{section}" listed in the Template Document.
Assess whether the Content Document ("{prompt_file}") incorporates the necessary context and instructions for these points, whether they are explicitly stated under a section labeled "{section}" or implicitly covered elsewhere in the document.
Provide a "Yes" if the Content Document effectively integrates most of the points listed under the "{section}", considering both explicit and implicit coverage.
If the integration is lacking or incomplete, carefully reconsider whether the points are adequately addressed within the Content Document.
If you provide a "Yes," provide a detailed explanation highlighting how the Content Document integrates the {section} effectively. Discuss both explicit mentions and any implicit coverage that contributes to their implementation.
Make sure to accurately evaluate the alignment of the "{section}" provided in the Template Document with their presence or coverage in the Content Document. [/INST] </s>"""


def _extract_helpful_answers(results):
  """Join the text following "Helpful Answer:" in each result; '' if none is found."""
  matches = (_HELPFUL_ANSWER_RE.search(result) for result in results)
  combined = "".join(m.group(1) + _ANSWER_SEPARATOR for m in matches if m)
  return combined.strip()


def MOP(path,
        template_index_path='/content/drive/MyDrive/Innovation Themes /MoP/Template_embeddings',
        template_chunk_count=8):
  """Check the PDF(s) in ``path`` against the template's Pre-/Post-Check procedures.

  Steps, in order:
    1. Load the saved template FAISS index with ``BAAI/bge-m3`` embeddings.
    2. Load every PDF under ``path``, split it into 1200-character chunks
       (300 overlap), embed the chunks and merge them into the template index.
    3. Remove one file from ``path`` via ``delete_files_except`` and use its
       name as the "Content Document" name in the prompts.
    4. Ask the Mixtral model, through a "stuff" RetrievalQA chain, one
       question for "Pre-Check  Procedures" and one for "Post-Check  Procedures".

  Args:
    path: Directory holding the uploaded PDF(s).
    template_index_path: Directory of the saved template FAISS index.
    template_chunk_count: Number added to the count of uploaded chunks to
      form the retriever's ``k``.

  Returns:
    The text found after "Helpful Answer:" in each model reply, each followed
    by a separator line; if no reply contains that marker, the two raw
    replies concatenated.
  """
  embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-m3")
  # NOTE(review): allow_dangerous_deserialization=True opts in to loading a
  # pickled index; template_index_path must only point at a trusted index.
  template_vectors = FAISS.load_local(template_index_path, embeddings, allow_dangerous_deserialization=True)

  documents = PyPDFDirectoryLoader(path).load()
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=1200 , chunk_overlap=300)
  document_chunks = text_splitter.split_documents(documents)
  vector_document = FAISS.from_documents(document_chunks, embedding=embeddings)
  template_vectors.merge_from(vector_document)

  # The upload is already embedded; remove it and keep its name for the prompts.
  prompt_file = delete_files_except(path)

  repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1"
  llm = HuggingFaceHub(repo_id=repo_id, model_kwargs={"temperature": 0.001, "max_new_tokens": 5000})
  retriever = template_vectors.as_retriever(
      search_type="similarity",
      search_kwargs={"k": template_chunk_count + len(document_chunks)})
  qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)

  # The double space inside the section names is kept as in the original prompts.
  result1 = qa.run(_build_check_prompt(prompt_file, "Pre-Check  Procedures"))
  result2 = qa.run(_build_check_prompt(prompt_file, "Post-Check  Procedures"))

  helpful_answer_texts_o = _extract_helpful_answers([result1, result2])
  if helpful_answer_texts_o:
    return helpful_answer_texts_o

  return result1 + result2
  

def process_file(fileobj):
  """Copy an uploaded file into the working folder and run ``MOP`` on that folder.

  Args:
    fileobj: The upload handed over by Gradio. Either an object exposing the
      file's path as ``.name`` or the path itself is accepted.

  Returns:
    Whatever ``MOP`` returns for the working folder.
  """
  destination_folder="/content/check/"
  # Resolve the source path once; the original mixed `fileobj` and
  # `fileobj.name`, which fails when the upload is a non-path-like file object.
  source_path = getattr(fileobj, "name", fileobj)
  file_name = os.path.basename(source_path)
  # copyfile does not create missing parent directories.
  os.makedirs(destination_folder, exist_ok=True)
  destination_path = os.path.join(destination_folder, file_name)
  shutil.copyfile(source_path, destination_path)
  return MOP(destination_folder)


# Gradio UI: one file-upload input passed to process_file, whose return value
# is shown as text. The app is launched as soon as this module runs.
demo = gr.Interface(fn=process_file, inputs=["file"], outputs="text")
demo.launch()