# Spaces:
# Running
# Running
import faiss | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
import numpy as np | |
class DocumentRetriever:
    """Retrieve plain-text documents by TF-IDF similarity through a FAISS index.

    Documents are read from ``*.txt`` files, vectorized with a
    ``TfidfVectorizer`` and stored in a ``faiss.IndexFlatL2``. Queries are
    vectorized with the same fitted vectorizer and matched against that index.
    """

    def __init__(self):
        """Create an empty retriever; no index exists until documents load."""
        # Raw document texts; list position is the row id used in the index.
        self.documents = []
        self.vectorizer = TfidfVectorizer()
        # Stays None until _build_index() has run on a non-empty corpus.
        self.index = None

    def load_documents(self, source_dir):
        """Append every ``*.txt`` file in *source_dir* and rebuild the index.

        Args:
            source_dir: Directory to scan (non-recursively) for ``.txt`` files.

        Returns:
            None. If the directory does not exist a message is printed and
            nothing is loaded. If no documents are available afterwards the
            index is left unset.
        """
        from pathlib import Path

        data_dir = Path(source_dir)
        if not data_dir.exists():
            print(f"Source directory not found: {source_dir}")
            return

        # Sort so document order (and therefore index ids) does not depend on
        # the order in which the filesystem happens to list the files.
        for path in sorted(data_dir.glob("*.txt")):
            self.documents.append(path.read_text(encoding="utf-8"))
        print(f"Loaded {len(self.documents)} documents.")

        # Create the FAISS index
        self._build_index()

    def _build_index(self):
        """Fit the vectorizer on all loaded documents and (re)build the index."""
        if not self.documents:
            # Fitting TF-IDF on an empty corpus raises ValueError; leave the
            # retriever uninitialized so retrieve() reports that instead.
            self.index = None
            return

        # Generate dense TF-IDF vectors for the documents.
        doc_vectors = self.vectorizer.fit_transform(self.documents).toarray()

        # The index dimensionality is the TF-IDF vocabulary size.
        self.index = faiss.IndexFlatL2(doc_vectors.shape[1])
        self.index.add(doc_vectors.astype(np.float32))

    def retrieve(self, query, top_k=5):
        """Return up to *top_k* documents closest to *query*.

        Args:
            query: Query text, vectorized with the fitted TF-IDF vectorizer.
            top_k: Maximum number of documents to return.

        Returns:
            A list of document texts in the order FAISS reports them. Returns
            ``["Document retrieval is not initialized."]`` when no index has
            been built, and an empty list when *top_k* is not positive.
        """
        if self.index is None:
            return ["Document retrieval is not initialized."]

        # Never ask for more neighbours than there are documents.
        k = min(top_k, len(self.documents))
        if k <= 0:
            return []

        # Vectorize the query
        query_vector = (
            self.vectorizer.transform([query]).toarray().astype(np.float32)
        )

        # Perform FAISS search
        _distances, indices = self.index.search(query_vector, k)

        # FAISS marks missing neighbours with -1. Without the lower bound a
        # -1 would index the last document and be returned as a false match.
        doc_count = len(self.documents)
        return [self.documents[i] for i in indices[0] if 0 <= i < doc_count]