import os

import openai
import streamlit as st
from dotenv import load_dotenv
from openai import OpenAI
from langchain.chains import RetrievalQA
from langchain_community.chat_models import ChatOpenAI
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS

from brain import get_index_for_documents
from evaluation_module import RAGEvaluator

# Set the title for the Streamlit app
st.title("DocuChat with Evaluation")

# Load variables from .env first, then set up the OpenAI client
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=openai.api_key)

# Initialize the evaluator
evaluator = RAGEvaluator()


# Function to create a vector database from the uploaded files and raw text
@st.cache_resource
def create_vectordb(files, filenames, raw_texts):
    with st.spinner("Creating vector database..."):
        vectordb = get_index_for_documents(
            [file.getvalue() for file in files if file.type == "application/pdf"],
            filenames,
            [line for line in raw_texts.splitlines() if line.strip()],
            openai.api_key,
        )
    return vectordb


# Upload files using Streamlit's file uploader
uploaded_files = st.file_uploader(
    "Upload your documents (PDF or TXT)",
    type=["pdf", "txt"],
    accept_multiple_files=True,
    label_visibility="hidden",
)

# Text area for raw text input
raw_text = st.text_area("Or enter your raw text here:", height=150)

# If files are uploaded or raw text is provided, create the vectordb and store it in the session state
if uploaded_files or raw_text:
    file_names = [file.name for file in uploaded_files] if uploaded_files else []
    st.session_state["vectordb"] = create_vectordb(uploaded_files, file_names, raw_text)

# Define the template for the chatbot prompt
prompt_template = """
You are a helpful assistant who answers users' questions based on multiple contexts given to you.
Keep your answer short and to the point.
The evidence is the context of the document extract with metadata.
Carefully focus on the metadata, especially 'filename' and 'page', whenever answering.
Make sure to add the filename and page number at the end of each sentence you cite.
You should also be able to give a summary based on the document extract given to you, but do not hallucinate.
Reply "Not applicable" if the text is irrelevant.
The document content is: {doc_extract}
"""

# Get the current prompt from the session state or set a default value
prompt = st.session_state.get("prompt", [{"role": "system", "content": "none"}])

# Display previous chat messages
for message in prompt:
    if message["role"] != "system":
        with st.chat_message(message["role"]):
            st.write(message["content"])

# Get the user's question using Streamlit's chat input
question = st.chat_input("Ask anything")

# Handle the user's question
if question:
    vectordb = st.session_state.get("vectordb", None)
    if not vectordb:
        with st.chat_message("assistant"):
            st.write("You need to provide a PDF, a TXT file, or raw text.")
            st.stop()

    # Search the vectordb for content similar to the user's question
    search_results = vectordb.similarity_search(question, k=3)
    doc_extract = "\n".join([result.page_content for result in search_results])

    # Update the system prompt with the document extract
    prompt[0] = {
        "role": "system",
        "content": prompt_template.format(doc_extract=doc_extract),
    }

    # Add the user's question to the prompt and display it
    prompt.append({"role": "user", "content": question})
    with st.chat_message("user"):
        st.write(question)

    # Display an empty assistant message while waiting for the response
    with st.chat_message("assistant"):
        botmsg = st.empty()

    # Call the chat model with streaming and display the response as it arrives
    response = []
    result = ""
    for chunk in client.chat.completions.create(
        model="gpt-3.5-turbo", messages=prompt, stream=True
    ):
        text = chunk.choices[0].delta.content
        if text is not None:
            response.append(text)
            result = "".join(response).strip()
            botmsg.write(result)

    # Add the assistant's response to the prompt
    prompt.append({"role": "assistant", "content": result})

    # Store the updated prompt in the session state
    st.session_state["prompt"] = prompt

    # Keep the latest answer and retrieved context so the evaluation section
    # below still has them after the rerun triggered by the button click
    st.session_state["last_result"] = result
    st.session_state["last_doc_extract"] = doc_extract

# Evaluation Section
st.write("## Evaluation Results")
if st.button("Evaluate Response"):
    result = st.session_state.get("last_result", "")
    doc_extract = st.session_state.get("last_doc_extract", "")
    if doc_extract and result:
        # Perform evaluation
        metrics = evaluator.evaluate_all(result, doc_extract)

        # Display metrics with explanations
        st.write(f"**BLEU Score**: {metrics['BLEU']:.2f}")
        st.write(
            "BLEU measures the overlap between the generated output and the reference text "
            "based on n-grams. Range: 0-100. Higher scores indicate a better match."
        )

        st.write(f"**ROUGE-1 Score**: {metrics['ROUGE-1']:.2f}")
        st.write(
            "ROUGE-1 measures the overlap of unigrams between the generated output and the "
            "reference text. Range: 0-1. Higher scores indicate a better match."
        )

        st.write(f"**BERT Precision**: {metrics['BERT P']:.2f}")
        st.write(f"**BERT Recall**: {metrics['BERT R']:.2f}")
        st.write(f"**BERT F1 Score**: {metrics['BERT F1']:.2f}")
        st.write(
            "BERTScore evaluates the semantic similarity between the generated output and the "
            "reference text using BERT embeddings. Range: 0-1. Higher scores indicate better "
            "semantic similarity."
        )

        st.write(f"**Perplexity**: {metrics['Perplexity']:.2f}")
        st.write(
            "Perplexity measures how well a language model predicts the text. Range: 1 to ∞. "
            "Lower values indicate better fluency and coherence."
        )

        st.write(f"**Diversity**: {metrics['Diversity']:.2f}")
        st.write(
            "Diversity measures the uniqueness of bigrams in the generated output. Range: 0-1. "
            "Higher values indicate more diverse and varied output."
        )
    else:
        st.write("Ask a question first, then evaluate the response.")
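
# -----------------------------------------------------------------------------
# The code above assumes evaluation_module.RAGEvaluator exposes a single
# evaluate_all(generated_text, reference_text) method returning a dict with the
# keys "BLEU", "ROUGE-1", "BERT P", "BERT R", "BERT F1", "Perplexity", and
# "Diversity". A minimal sketch of that interface (hypothetical; the real
# module may compute the metrics with nltk, rouge-score, bert-score, etc.):
#
#     class RAGEvaluator:
#         def evaluate_all(self, generated: str, reference: str) -> dict:
#             """Score `generated` against `reference` and return metric -> value."""
#             return {
#                 "BLEU": ..., "ROUGE-1": ..., "BERT P": ..., "BERT R": ...,
#                 "BERT F1": ..., "Perplexity": ..., "Diversity": ...,
#             }
# -----------------------------------------------------------------------------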