ChatBotAgenticRAG_dup

Sleeping

App Files Files Community

Phoenix21 committed on 23 days ago

Commit

864c041

verified ·

1 Parent(s): bbd2528

Update pipeline.py

Browse files

Files changed (1) hide show

pipeline.py +50 -90

pipeline.py CHANGED Viewed

@@ -1,18 +1,16 @@
-# pipeline.py
 import os
 import getpass
 import pandas as pd
 from typing import Optional
 from langchain.docstore.document import Document
 from langchain.embeddings import HuggingFaceEmbeddings
 from langchain.vectorstores import FAISS
 from langchain.chains import RetrievalQA
 from smolagents import CodeAgent, DuckDuckGoSearchTool, ManagedAgent, LiteLLMModel
 import litellm
-# We import the chain builders from our separate files
 from classification_chain import get_classification_chain
 from refusal_chain import get_refusal_chain
 from tailor_chain import get_tailor_chain
@@ -21,82 +19,52 @@ from cleaner_chain import get_cleaner_chain, CleanerChain
 # We also import the relevant RAG logic here or define it directly
 # (We define build_rag_chain in this file for clarity)
-###############################################################################
 # 1) Environment: set up keys if missing
-###############################################################################
 if not os.environ.get("GEMINI_API_KEY"):
     os.environ["GEMINI_API_KEY"] = getpass.getpass("Enter your Gemini API Key: ")
 if not os.environ.get("GROQ_API_KEY"):
     os.environ["GROQ_API_KEY"] = getpass.getpass("Enter your GROQ API Key: ")
-###############################################################################
-# 2) build_or_load_vectorstore
-###############################################################################
-def build_or_load_vectorstore(csv_path: str, store_dir: str) -> FAISS:
-    if os.path.exists(store_dir):
-        print(f"DEBUG: Found existing FAISS store at '{store_dir}'. Loading...")
-        embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/multi-qa-mpnet-base-dot-v1")
-        vectorstore = FAISS.load_local(store_dir, embeddings)
-        return vectorstore
-    else:
-        print(f"DEBUG: Building new store from CSV: {csv_path}")
-        df = pd.read_csv(csv_path)
-        df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
-        df.columns = df.columns.str.strip()
-        if "Answer" in df.columns:
-            df.rename(columns={"Answer": "Answers"}, inplace=True)
-        if "Question" not in df.columns and "Question " in df.columns:
-            df.rename(columns={"Question ": "Question"}, inplace=True)
-        if "Question" not in df.columns or "Answers" not in df.columns:
-            raise ValueError("CSV must have 'Question' and 'Answers' columns.")
-        docs = []
-        for _, row in df.iterrows():
-            q = str(row["Question"])
-            ans = str(row["Answers"])
-            doc = Document(page_content=ans, metadata={"question": q})
-            docs.append(doc)
-        embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/multi-qa-mpnet-base-dot-v1")
-        vectorstore = FAISS.from_documents(docs, embedding=embeddings)
-        vectorstore.save_local(store_dir)
-        return vectorstore
-###############################################################################
-# 3) Build RAG chain for Gemini
-###############################################################################
-from langchain.llms.base import LLM
-def build_rag_chain(llm_model: LiteLLMModel, vectorstore: FAISS) -> RetrievalQA:
-    class GeminiLangChainLLM(LLM):
-        def _call(self, prompt: str, stop: Optional[list] = None, **kwargs) -> str:
-            messages = [{"role": "user", "content": prompt}]
-            return llm_model(messages, stop_sequences=stop)
-        @property
-        def _llm_type(self) -> str:
-            return "custom_gemini"
-    retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 3})
-    gemini_as_llm = GeminiLangChainLLM()
-    rag_chain = RetrievalQA.from_chain_type(
-        llm=gemini_as_llm,
-        chain_type="stuff",
-        retriever=retriever,
-        return_source_documents=True
-    )
-    return rag_chain
-###############################################################################
-# 4) Initialize all the separate chains
-###############################################################################
-# Classification chain
 classification_chain = get_classification_chain()
-# Refusal chain
-refusal_chain = get_refusal_chain()
-# Tailor chain
 tailor_chain = get_tailor_chain()
-# Cleaner chain
 cleaner_chain = get_cleaner_chain()
-###############################################################################
-# 5) Build our vectorstores + RAG chains
-###############################################################################
 wellness_csv = "AIChatbot.csv"
 brand_csv = "BrandAI.csv"
 wellness_store_dir = "faiss_wellness_store"
@@ -109,33 +77,25 @@ gemini_llm = LiteLLMModel(model_id="gemini/gemini-pro", api_key=os.environ.get("
 wellness_rag_chain = build_rag_chain(gemini_llm, wellness_vectorstore)
 brand_rag_chain = build_rag_chain(gemini_llm, brand_vectorstore)
-###############################################################################
-# 6) Tools / Agents for web search
-###############################################################################
-search_tool = DuckDuckGoSearchTool()
-web_agent = CodeAgent(tools=[search_tool], model=gemini_llm)
-managed_web_agent = ManagedAgent(agent=web_agent, name="web_search", description="Runs web search for you.")
-manager_agent = CodeAgent(tools=[], model=gemini_llm, managed_agents=[managed_web_agent])
-def do_web_search(query: str) -> str:
-    print("DEBUG: Attempting web search for more info...")
-    search_query = f"Give me relevant info: {query}"
-    response = manager_agent.run(search_query)
-    return response
-###############################################################################
-# 7) Orchestrator: run_with_chain
-###############################################################################
 def run_with_chain(query: str) -> str:
     print("DEBUG: Starting run_with_chain...")
-    # 1) Classify
     class_result = classification_chain.invoke({"query": query})
     classification = class_result.get("text", "").strip()
     print("DEBUG: Classification =>", classification)
     # If OutOfScope => refusal => tailor => return
     if classification == "OutOfScope":
-        refusal_text = refusal_chain.run({})
         final_refusal = tailor_chain.run({"response": refusal_text})
         return final_refusal.strip()
@@ -164,6 +124,6 @@ def run_with_chain(query: str) -> str:
         return final_answer.strip()
     # fallback
-    refusal_text = refusal_chain.run({})
     final_refusal = tailor_chain.run({"response": refusal_text})
     return final_refusal.strip()

 import os
 import getpass
+import spacy  # Import spaCy for NER functionality
 import pandas as pd
 from typing import Optional
 from langchain.docstore.document import Document
 from langchain.embeddings import HuggingFaceEmbeddings
 from langchain.vectorstores import FAISS
 from langchain.chains import RetrievalQA
 from smolagents import CodeAgent, DuckDuckGoSearchTool, ManagedAgent, LiteLLMModel
 import litellm
+# Import the chain builders from our separate files
 from classification_chain import get_classification_chain
 from refusal_chain import get_refusal_chain
 from tailor_chain import get_tailor_chain
 # We also import the relevant RAG logic here or define it directly
 # (We define build_rag_chain in this file for clarity)
 # 1) Environment: set up keys if missing
 if not os.environ.get("GEMINI_API_KEY"):
     os.environ["GEMINI_API_KEY"] = getpass.getpass("Enter your Gemini API Key: ")
 if not os.environ.get("GROQ_API_KEY"):
     os.environ["GROQ_API_KEY"] = getpass.getpass("Enter your GROQ API Key: ")
+# 2) Load spaCy model for NER
+nlp = spacy.load("en_core_web_sm")
+# Function to extract the main topic using NER
+def extract_main_topic(query: str) -> str:
+    """
+    Pick a short topic string out of the user's query using the spaCy pipeline `nlp`.
+
+    Args:
+        query: The raw user query text.
+
+    Returns:
+        The text of the first entity in `doc.ents` whose label is ORG, PRODUCT,
+        PERSON, GPE or TIME; otherwise the text of the first token tagged NOUN
+        or PROPN; otherwise the literal string "this topic".
+    """
+    doc = nlp(query)
+    # First pass: look for a named entity; None means nothing matched yet
+    main_topic = None
+    for ent in doc.ents:
+        # Only entities with one of the labels below are accepted; others are skipped
+        if ent.label_ in ["ORG", "PRODUCT", "PERSON", "GPE", "TIME"]:  # Extend this list to accept more entity types
+            main_topic = ent.text
+            break
+    # Second pass (only when no entity matched): take the first noun or proper noun token
+    if not main_topic:
+        for token in doc:
+            if token.pos_ in ["NOUN", "PROPN"]:  # First match wins; later nouns are ignored
+                main_topic = token.text
+                break
+    # Never return None or an empty string: fall back to the generic phrase "this topic"
+    return main_topic if main_topic else "this topic"
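+# 3) build_or_load_vectorstore -- NOTE(review): this commit deletes the definition instead of leaving it unchanged; restore or import it if it is still used
+# 4) Build RAG chain for Gemini -- NOTE(review): build_rag_chain is also deleted by this commit but is still called below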
+# 5) Initialize all the separate chains
 classification_chain = get_classification_chain()
+refusal_chain = get_refusal_chain()  # Refusal chain will now use dynamic topic
 tailor_chain = get_tailor_chain()
 cleaner_chain = get_cleaner_chain()
+# 6) Build our vectorstores + RAG chains
 wellness_csv = "AIChatbot.csv"
 brand_csv = "BrandAI.csv"
 wellness_store_dir = "faiss_wellness_store"
 wellness_rag_chain = build_rag_chain(gemini_llm, wellness_vectorstore)
 brand_rag_chain = build_rag_chain(gemini_llm, brand_vectorstore)
+# 7) Tools / Agents for web search -- NOTE(review): the agent setup and do_web_search are deleted by this commit, not left unchanged
+# 8) Orchestrator: run_with_chain
 def run_with_chain(query: str) -> str:
     print("DEBUG: Starting run_with_chain...")
+    # 1) Classify the query
     class_result = classification_chain.invoke({"query": query})
     classification = class_result.get("text", "").strip()
     print("DEBUG: Classification =>", classification)
     # If OutOfScope => refusal => tailor => return
     if classification == "OutOfScope":
+        # Extract the main topic for the refusal message
+        topic = extract_main_topic(query)
+        print("DEBUG: Extracted Topic =>", topic)
+        # Pass the extracted topic to the refusal chain
+        refusal_text = refusal_chain.run({"topic": topic})
         final_refusal = tailor_chain.run({"response": refusal_text})
         return final_refusal.strip()
         return final_answer.strip()
     # fallback
+    refusal_text = refusal_chain.run({"topic": "this topic"})
     final_refusal = tailor_chain.run({"response": refusal_text})
     return final_refusal.strip()