ChatBotAgenticRAG_dup

Sleeping

App Files Files Community

Phoenix21 committed 23 days ago

Commit

74221f2

verified ·

1 Parent(s): 774c0b8

Update pipeline.py

Browse files

Files changed (1) hide show

pipeline.py +52 -47

pipeline.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import os
 import getpass
-import spacy  # Import spaCy for NER functionality
 import pandas as pd
 from typing import Optional
 from langchain.docstore.document import Document
@@ -8,17 +8,11 @@ from langchain.embeddings import HuggingFaceEmbeddings
 from langchain.vectorstores import FAISS
 from langchain.chains import RetrievalQA
 from smolagents import CodeAgent, DuckDuckGoSearchTool, ManagedAgent, LiteLLMModel
-import subprocess  # Import subprocess to run shell commands
-from langchain.llms.base import LLM  # Import LLM
-# Import the functions from respective chain files
-from classification_chain import get_classification_chain
-from refusal_chain import get_refusal_chain
-from tailor_chain import get_tailor_chain
-from cleaner_chain import get_cleaner_chain
 # Mistral Client Setup
-from mistralai import Mistral  # Import the Mistral client
 from pydantic_ai import Agent  # Import Pydantic AI's Agent
 # Initialize Mistral API client
@@ -28,7 +22,7 @@ client = Mistral(api_key=mistral_api_key)
 # Initialize Pydantic AI Agent (for text validation)
 pydantic_agent = Agent('mistral:mistral-large-latest', result_type=str)
-# Load spaCy model for NER and download the spaCy model if not already installed
 def install_spacy_model():
     try:
         spacy.load("en_core_web_sm")
@@ -38,38 +32,53 @@ def install_spacy_model():
         subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"], check=True)
         print("spaCy model 'en_core_web_sm' downloaded successfully.")
-# Call the function to install the spaCy model if needed
 install_spacy_model()
-# Load the spaCy model globally
 nlp = spacy.load("en_core_web_sm")
 # Function to moderate text using Mistral moderation API
 def moderate_text(query: str) -> str:
     """
     Classifies the query as harmful or not using Mistral Moderation via Mistral API.
     Returns "OutOfScope" if harmful, otherwise returns the original query.
     """
-    # Validate the text type using Pydantic AI's Agent
     try:
-        # Use Pydantic AI agent to ensure correct text type
-        pydantic_agent.run_sync(query)
     except Exception as e:
-        print(f"Error validating text with Pydantic AI: {e}")
         return "Invalid text format."
-    # Use the moderation API to evaluate if the query is harmful
     response = client.classifiers.moderate_chat(
         model="mistral-moderation-latest",
-        inputs=[
-            {"role": "user", "content": query},
-        ],
     )
-    # Extracting category scores from response
     categories = response['results'][0]['categories']
-    # Check if harmful content is flagged in moderation categories
     if categories.get("violence_and_threats", False) or \
        categories.get("hate_and_discrimination", False) or \
        categories.get("dangerous_and_criminal_content", False) or \
@@ -78,7 +87,7 @@ def moderate_text(query: str) -> str:
     return query
-# 3) build_or_load_vectorstore (no changes)
 def build_or_load_vectorstore(csv_path: str, store_dir: str) -> FAISS:
     if os.path.exists(store_dir):
         print(f"DEBUG: Found existing FAISS store at '{store_dir}'. Loading...")
@@ -107,7 +116,7 @@ def build_or_load_vectorstore(csv_path: str, store_dir: str) -> FAISS:
         vectorstore.save_local(store_dir)
         return vectorstore
-# 4) Build RAG chain for Gemini (no changes)
 def build_rag_chain(llm_model: LiteLLMModel, vectorstore: FAISS) -> RetrievalQA:
     class GeminiLangChainLLM(LLM):
         def _call(self, prompt: str, stop: Optional[list] = None, **kwargs) -> str:
@@ -128,13 +137,18 @@ def build_rag_chain(llm_model: LiteLLMModel, vectorstore: FAISS) -> RetrievalQA:
     )
     return rag_chain
-# 5) Initialize all the separate chains
-classification_chain = get_classification_chain()
-refusal_chain = get_refusal_chain()  # Refusal chain will now use dynamic topic
-tailor_chain = get_tailor_chain()
-cleaner_chain = get_cleaner_chain()
-# 6) Build our vectorstores + RAG chains
 wellness_csv = "AIChatbot.csv"
 brand_csv = "BrandAI.csv"
 wellness_store_dir = "faiss_wellness_store"
@@ -147,7 +161,7 @@ gemini_llm = LiteLLMModel(model_id="gemini/gemini-pro", api_key=os.environ.get("
 wellness_rag_chain = build_rag_chain(gemini_llm, wellness_vectorstore)
 brand_rag_chain = build_rag_chain(gemini_llm, brand_vectorstore)
-# 7) Tools / Agents for web search (no changes)
 search_tool = DuckDuckGoSearchTool()
 web_agent = CodeAgent(tools=[search_tool], model=gemini_llm)
 managed_web_agent = ManagedAgent(agent=web_agent, name="web_search", description="Runs web search for you.")
@@ -159,32 +173,25 @@ def do_web_search(query: str) -> str:
     response = manager_agent.run(search_query)
     return response
-# 8) Orchestrator: run_with_chain
 def run_with_chain(query: str) -> str:
     print("DEBUG: Starting run_with_chain...")
-    # 1) Moderate the query for harmful content
     moderated_query = moderate_text(query)
     if moderated_query == "OutOfScope":
         return "Sorry, this query contains harmful or inappropriate content."
-    # 2) Classify the query
     class_result = classification_chain.invoke({"query": moderated_query})
     classification = class_result.get("text", "").strip()
     print("DEBUG: Classification =>", classification)
-    # If OutOfScope => refusal => tailor => return
     if classification == "OutOfScope":
-        # Extract the main topic for the refusal message
-        topic = extract_main_topic(moderated_query)
-        print("DEBUG: Extracted Topic =>", topic)
-        # Pass the extracted topic to the refusal chain
-        refusal_text = refusal_chain.run({"topic": topic})
         final_refusal = tailor_chain.run({"response": refusal_text})
         return final_refusal.strip()
-    # If Wellness => wellness RAG => if insufficient => web => unify => tailor
     if classification == "Wellness":
         rag_result = wellness_rag_chain({"query": moderated_query})
         csv_answer = rag_result["result"].strip()
@@ -200,7 +207,6 @@ def run_with_chain(query: str) -> str:
         final_answer = tailor_chain.run({"response": final_merged})
         return final_answer.strip()
-    # If Brand => brand RAG => tailor => return
     if classification == "Brand":
         rag_result = brand_rag_chain({"query": moderated_query})
         csv_answer = rag_result["result"].strip()
@@ -208,7 +214,6 @@ def run_with_chain(query: str) -> str:
         final_answer = tailor_chain.run({"response": final_merged})
         return final_answer.strip()
-    # fallback
     refusal_text = refusal_chain.run({"topic": "this topic"})
     final_refusal = tailor_chain.run({"response": refusal_text})
     return final_refusal.strip()

 import os
 import getpass
+import spacy
 import pandas as pd
 from typing import Optional
 from langchain.docstore.document import Document
 from langchain.vectorstores import FAISS
 from langchain.chains import RetrievalQA
 from smolagents import CodeAgent, DuckDuckGoSearchTool, ManagedAgent, LiteLLMModel
+import subprocess
+from langchain.llms.base import LLM
 # Mistral Client Setup
+from mistralai import Mistral
 from pydantic_ai import Agent  # Import Pydantic AI's Agent
 # Initialize Mistral API client
 # Initialize Pydantic AI Agent (for text validation)
 pydantic_agent = Agent('mistral:mistral-large-latest', result_type=str)
+# Load spaCy model for NER and download it if not already installed
 def install_spacy_model():
     try:
         spacy.load("en_core_web_sm")
         subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"], check=True)
         print("spaCy model 'en_core_web_sm' downloaded successfully.")
 install_spacy_model()
 nlp = spacy.load("en_core_web_sm")
+# Function to extract the main topic from the query using spaCy NER
+def extract_main_topic(query: str) -> str:
+    """
+    Extracts the main topic from the user's query using spaCy's NER.
+    Returns the first named entity or noun found in the query.
+    """
+    doc = nlp(query)
+    # Try to extract the main topic as a named entity (person, product, etc.)
+    main_topic = None
+    for ent in doc.ents:
+        # Filter for specific entity types (you can adjust this based on your needs)
+        if ent.label_ in ["ORG", "PRODUCT", "PERSON", "GPE", "TIME"]:  # Add more entity labels as needed
+            main_topic = ent.text
+            break
+    # If no named entity found, fallback to extracting the first noun or proper noun
+    if not main_topic:
+        for token in doc:
+            if token.pos_ in ["NOUN", "PROPN"]:  # Extract first noun or proper noun
+                main_topic = token.text
+                break
+    # Return the extracted topic or a fallback value if no topic is found
+    return main_topic if main_topic else "this topic"
 # Function to moderate text using Mistral moderation API
 def moderate_text(query: str) -> str:
     """
     Classifies the query as harmful or not using Mistral Moderation via Mistral API.
     Returns "OutOfScope" if harmful, otherwise returns the original query.
     """
     try:
+        pydantic_agent.run_sync(query)  # Validate input
     except Exception as e:
+        print(f"Error validating text: {e}")
         return "Invalid text format."
     response = client.classifiers.moderate_chat(
         model="mistral-moderation-latest",
+        inputs=[{"role": "user", "content": query}]
     )
     categories = response['results'][0]['categories']
     if categories.get("violence_and_threats", False) or \
        categories.get("hate_and_discrimination", False) or \
        categories.get("dangerous_and_criminal_content", False) or \
     return query
+# Build or load vectorstore function
 def build_or_load_vectorstore(csv_path: str, store_dir: str) -> FAISS:
     if os.path.exists(store_dir):
         print(f"DEBUG: Found existing FAISS store at '{store_dir}'. Loading...")
         vectorstore.save_local(store_dir)
         return vectorstore
+# Build RAG chain for Gemini (no changes)
 def build_rag_chain(llm_model: LiteLLMModel, vectorstore: FAISS) -> RetrievalQA:
     class GeminiLangChainLLM(LLM):
         def _call(self, prompt: str, stop: Optional[list] = None, **kwargs) -> str:
     )
     return rag_chain
+# Initialize all the separate chains
+from classification_chain import get_classification_chain
+from refusal_chain import get_refusal_chain
+from tailor_chain import get_tailor_chain
+from cleaner_chain import get_cleaner_chain
+classification_chain = get_classification_chain()  # Ensure this function is imported correctly
+refusal_chain = get_refusal_chain()  # Ensure this function is imported correctly
+tailor_chain = get_tailor_chain()  # Ensure this function is imported correctly
+cleaner_chain = get_cleaner_chain()  # Ensure this function is imported correctly
+# Build our vectorstores + RAG chains
 wellness_csv = "AIChatbot.csv"
 brand_csv = "BrandAI.csv"
 wellness_store_dir = "faiss_wellness_store"
 wellness_rag_chain = build_rag_chain(gemini_llm, wellness_vectorstore)
 brand_rag_chain = build_rag_chain(gemini_llm, brand_vectorstore)
+# Tools / Agents for web search
 search_tool = DuckDuckGoSearchTool()
 web_agent = CodeAgent(tools=[search_tool], model=gemini_llm)
 managed_web_agent = ManagedAgent(agent=web_agent, name="web_search", description="Runs web search for you.")
     response = manager_agent.run(search_query)
     return response
+# Orchestrator: run_with_chain
 def run_with_chain(query: str) -> str:
     print("DEBUG: Starting run_with_chain...")
+    # Moderate the query for harmful content
     moderated_query = moderate_text(query)
     if moderated_query == "OutOfScope":
         return "Sorry, this query contains harmful or inappropriate content."
+    # Classify the query
     class_result = classification_chain.invoke({"query": moderated_query})
     classification = class_result.get("text", "").strip()
     print("DEBUG: Classification =>", classification)
     if classification == "OutOfScope":
+        refusal_text = refusal_chain.run({"topic": "this topic"})
         final_refusal = tailor_chain.run({"response": refusal_text})
         return final_refusal.strip()
     if classification == "Wellness":
         rag_result = wellness_rag_chain({"query": moderated_query})
         csv_answer = rag_result["result"].strip()
         final_answer = tailor_chain.run({"response": final_merged})
         return final_answer.strip()
     if classification == "Brand":
         rag_result = brand_rag_chain({"query": moderated_query})
         csv_answer = rag_result["result"].strip()
         final_answer = tailor_chain.run({"response": final_merged})
         return final_answer.strip()
     refusal_text = refusal_chain.run({"topic": "this topic"})
     final_refusal = tailor_chain.run({"response": refusal_text})
     return final_refusal.strip()