Spaces:

chagu13
/

chagu-demo

Running

App Files Community

talexm committed on Dec 7, 2024

Commit

ed26242

1 Parent(s): 1d44212

news-data retrieval

Browse files

Files changed (1) hide show

rag_sec/document_retriver.py +12 -36

rag_sec/document_retriver.py CHANGED Viewed

@@ -1,47 +1,23 @@
 import faiss
 from sklearn.feature_extraction.text import TfidfVectorizer
 import numpy as np
 class DocumentRetriever:
     def __init__(self):
         self.documents = []
-        self.vectorizer = TfidfVectorizer()
-        self.index = None
-    def load_documents(self, source_dir):
-        from pathlib import Path
-        data_dir = Path(source_dir)
-        if not data_dir.exists():
-            print(f"Source directory not found: {source_dir}")
-            return
-        for file in data_dir.glob("*.txt"):
-            with open(file, "r", encoding="utf-8") as f:
-                self.documents.append(f.read())
-        print(f"Loaded {len(self.documents)} documents.")
-        # Create the FAISS index
-        self._build_index()
-    def _build_index(self):
-        # Generate TF-IDF vectors for documents
-        doc_vectors = self.vectorizer.fit_transform(self.documents).toarray()
-        # Create FAISS index
-        self.index = faiss.IndexFlatL2(doc_vectors.shape[1])
-        self.index.add(doc_vectors.astype(np.float32))
-    def retrieve(self, query, top_k=5):
-        if not self.index:
             return ["Document retrieval is not initialized."]
-        # Vectorize the query
-        query_vector = self.vectorizer.transform([query]).toarray().astype(np.float32)
-        # Perform FAISS search
-        distances, indices = self.index.search(query_vector, top_k)
-        # Return matching documents
-        return [self.documents[i] for i in indices[0] if i < len(self.documents)]

 import faiss
 from sklearn.feature_extraction.text import TfidfVectorizer
 import numpy as np
+from sklearn.datasets import fetch_20newsgroups
 class DocumentRetriever:
     def __init__(self):
         self.documents = []
+    def load_documents(self):
+        """Load 20 Newsgroups dataset."""
+        newsgroups_data = fetch_20newsgroups(subset='all')
+        self.documents = newsgroups_data.data
+        if not self.documents:
+            print("No documents loaded!")
+    def retrieve(self, query):
+        """Retrieve documents related to the query."""
+        if not self.documents:
             return ["Document retrieval is not initialized."]
+        # Simple keyword match (can replace with advanced semantic similarity later)
+        return [doc for doc in self.documents if query.lower() in doc.lower()]