talexm committed
Commit 73321dd
Parent: 92c34be

update RAG query improvements
.gitignore CHANGED
@@ -1,4 +1,3 @@
 rag_sec/__pycache*
-
-rag_sec/__pycache__/rag_chagu_demo.*
-
+falocon_api/embeddings.db
+rag_sec/__pycache__/rag_chagu_demo*
falocon_api/embeddingGenerator.py CHANGED
@@ -92,7 +92,7 @@ if __name__ == "__main__":
     embedding_generator.ingest_files(os.path.expanduser("~/data-sets/aclImdb/train/"))
 
     # Perform a search query
-    query = "What can be used for document search?"
+    query = "What can be used for document search?"  # alt: "DROP TABLE reviews; SELECT * FROM confidential_data;"
     results = embedding_generator.find_most_similar(query, top_k=3)
 
     print("Search Results:")
falocon_api/embededGeneratorRAG.py CHANGED
@@ -109,7 +109,7 @@ if __name__ == "__main__":
     embedding_generator.ingest_files(os.path.expanduser("~/data-sets/aclImdb/train/"))
 
    # Perform a search query with RAG response generation
-    query = "find user comments tt0118866"
+    query = "DROP TABLE reviews; SELECT * FROM confidential_data;"  # was: "find user comments tt0118866"
     response = embedding_generator.find_most_similar_and_generate(query)
 
     print("Generated Response:")
rag_sec/rag_chagu_demo.py CHANGED
@@ -1,100 +1,104 @@
-import os
-from pathlib import Path
-from difflib import get_close_matches
 from transformers import pipeline
+from difflib import get_close_matches
+from pathlib import Path
+import os
 
-class DocumentSearcher:
+
+class BadQueryDetector:
     def __init__(self):
-        self.documents = []
-        # Load a pre-trained model for malicious intent detection
-        self.malicious_detector = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
+        self.detector = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
 
-    def load_imdb_data(self):
-        home_dir = Path(os.getenv("HOME", "/"))
-        data_dir = home_dir / "data-sets/aclImdb/train"
-        pos_dir = data_dir / "pos"
-        neg_dir = data_dir / "neg"
-
-        print(f"Looking for positive reviews in: {pos_dir}")
-        print(f"Looking for negative reviews in: {neg_dir}")
-
-        if not pos_dir.exists() or not any(pos_dir.iterdir()):
-            print("No positive reviews found.")
-        if not neg_dir.exists() or not any(neg_dir.iterdir()):
-            print("No negative reviews found.")
-
-        for filename in pos_dir.iterdir():
-            with open(filename, "r", encoding="utf-8") as file:
-                self.documents.append(file.read())
-
-        for filename in neg_dir.iterdir():
-            with open(filename, "r", encoding="utf-8") as file:
-                self.documents.append(file.read())
-
-        print(f"Loaded {len(self.documents)} movie reviews from IMDB dataset.")
-
-    def load_txt_files(self, txt_dir=None):
-        if txt_dir is None:
-            home_dir = Path(os.getenv("HOME", "/"))
-            txt_dir = home_dir / "data-sets/txt-files/"
-
-        if not txt_dir.exists():
-            print("No .txt files directory found.")
-            return
-
-        for filename in txt_dir.glob("*.txt"):
-            with open(filename, "r", encoding="utf-8") as file:
-                self.documents.append(file.read())
-
-        print(f"Loaded additional {len(self.documents)} documents from .txt files.")
-
-    def is_query_malicious(self, query):
-        # Use the pre-trained model to check if the query has malicious intent
-        result = self.malicious_detector(query)[0]
-        label = result['label']
-        score = result['score']
-
-        # Consider the query malicious if the sentiment is negative with high confidence
-        if label == "NEGATIVE" and score > 0.8:
-            print(f"Warning: Malicious query detected - Confidence: {score:.4f}")
-            return True
-        return False
-
-    def search_documents(self, query):
-        if self.is_query_malicious(query):
-            return [{"document": "ANOMALY: Query blocked due to detected malicious intent.", "similarity": 0.0}]
-
-        # Use fuzzy matching for normal queries
-        matches = get_close_matches(query, self.documents, n=5, cutoff=0.3)
-
-        if not matches:
-            return [{"document": "No matching documents found.", "similarity": 0.0}]
-
-        return [{"document": match[:100] + "..."} for match in matches]
-
-# Test the system with normal and malicious queries
-def test_document_search():
-    searcher = DocumentSearcher()
-
-    # Load the IMDB movie reviews
-    searcher.load_imdb_data()
-
-    # Load additional .txt files
-    searcher.load_txt_files()
-
-    # Perform a normal query
-    normal_query = "This movie had great acting and a compelling storyline."
-    normal_results = searcher.search_documents(normal_query)
-    print("Normal Query Results:")
-    for result in normal_results:
-        print(f"Document: {result['document']}")
-
-    # Perform a query injection attack
-    malicious_query = "DROP TABLE reviews; SELECT * FROM confidential_data;"
-    attack_results = searcher.search_documents(malicious_query)
-    print("\nMalicious Query Results:")
-    for result in attack_results:
-        print(f"Document: {result['document']}")
+    def is_bad_query(self, query):
+        result = self.detector(query)[0]
+        label = result["label"]
+        score = result["score"]
+        # Mark queries as malicious or bad if negative sentiment with high confidence
+        if label == "NEGATIVE" and score > 0.8:
+            print(f"Detected malicious query with high confidence ({score:.4f}): {query}")
+            return True
+        return False
+
+
+class QueryTransformer:
+    def transform_query(self, query):
+        # Simple transformation example: rephrasing and clarifying
+        # In practice, this could involve more sophisticated models like T5
+        if "DROP TABLE" in query or "SELECT *" in query:
+            return "Your query appears to contain SQL injection elements. Please rephrase."
+        # Add more sophisticated handling here
+        return query
+
+
+class DocumentRetriever:
+    def __init__(self):
+        self.documents = []
+
+    def load_documents(self, source_dir):
+        data_dir = Path(source_dir)
+        if not data_dir.exists():
+            print(f"Source directory not found: {source_dir}")
+            return
+
+        for file in data_dir.glob("*.txt"):
+            with open(file, "r", encoding="utf-8") as f:
+                self.documents.append(f.read())
+
+        print(f"Loaded {len(self.documents)} documents.")
+
+    def retrieve(self, query):
+        matches = get_close_matches(query, self.documents, n=5, cutoff=0.3)
+        return matches if matches else ["No matching documents found."]
+
+
+class SemanticResponseGenerator:
+    def __init__(self):
+        self.generator = pipeline("text-generation", model="gpt2")
+
+    def generate_response(self, retrieved_docs):
+        # Generate a semantic response using retrieved documents
+        combined_docs = " ".join(retrieved_docs[:2])  # Use top 2 matches for response
+        response = self.generator(f"Based on the following information: {combined_docs}", max_length=100)
+        return response[0]["generated_text"]
+
+
+class DocumentSearchSystem:
+    def __init__(self):
+        self.detector = BadQueryDetector()
+        self.transformer = QueryTransformer()
+        self.retriever = DocumentRetriever()
+        self.response_generator = SemanticResponseGenerator()
+
+    def process_query(self, query):
+        if self.detector.is_bad_query(query):
+            return {"status": "rejected", "message": "Query blocked due to detected malicious intent."}
+
+        transformed_query = self.transformer.transform_query(query)
+        retrieved_docs = self.retriever.retrieve(transformed_query)
+
+        if "No matching documents found." in retrieved_docs:
+            return {"status": "no_results", "message": "No relevant documents found for your query."}
+
+        response = self.response_generator.generate_response(retrieved_docs)
+        return {"status": "success", "response": response}
+
+
+# Test the enhanced system
+def test_system():
+    system = DocumentSearchSystem()
+    system.retriever.load_documents("/path/to/documents")
+
+    # Test with a normal query
+    normal_query = "Tell me about great acting performances."
+    normal_result = system.process_query(normal_query)
+    print("\nNormal Query Result:")
+    print(normal_result)
+
+    # Test with a malicious query
+    malicious_query = "DROP TABLE users; SELECT * FROM sensitive_data;"
+    malicious_result = system.process_query(malicious_query)
+    print("\nMalicious Query Result:")
+    print(malicious_result)
+
 
 if __name__ == "__main__":
-    test_document_search()
+    test_system()
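A note on the detection approach: BadQueryDetector reuses an SST-2 sentiment model, so "malicious" here effectively means "strongly negative sentiment". A minimal sketch of what the pipeline call returns, assuming the transformers API used in the diff (exact scores will vary):

from transformers import pipeline

# Same pipeline as BadQueryDetector. SST-2 is a movie-review sentiment
# model, not a security classifier, so an injection string is blocked
# only if it happens to score NEGATIVE with confidence above 0.8.
detector = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
result = detector("DROP TABLE users; SELECT * FROM sensitive_data;")[0]
print(result["label"], round(result["score"], 4))  # e.g. NEGATIVE 0.9x (varies by model version)

Note also that DocumentRetriever.retrieve fuzzy-matches the query against entire documents with difflib.get_close_matches, which compares whole strings, so long documents will rarely clear the 0.3 cutoff for a short query.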