Spaces:
Running
Running
import faiss | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
import numpy as np | |
from sklearn.datasets import fetch_20newsgroups | |
class DocumentRetriever:
    """Keyword-based retriever over an in-memory list of text documents.

    Documents are held in ``self.documents`` (a list of strings). The list
    starts empty and is populated by :meth:`load_documents`.
    """

    def __init__(self):
        """Create a retriever with no documents loaded."""
        # Populated by load_documents(); retrieve() treats an empty list as
        # "not initialized".
        self.documents = []

    def load_documents(self):
        """Load 20 Newsgroups dataset.

        Fetches the ``'all'`` subset via ``fetch_20newsgroups`` and stores its
        ``data`` attribute in ``self.documents``. Prints a warning when the
        resulting collection is empty.
        """
        newsgroups_data = fetch_20newsgroups(subset='all')
        self.documents = newsgroups_data.data
        if not self.documents:
            print("No documents loaded!")

    def retrieve(self, query):
        """Retrieve documents related to the query.

        Args:
            query: Text to search for. Matching is a case-insensitive
                substring test, so an empty string matches every document.

        Returns:
            A list of the documents that contain ``query``, in their original
            order. When no documents are loaded, a single-element list holding
            the message ``"Document retrieval is not initialized."`` is
            returned instead.
        """
        if not self.documents:
            return ["Document retrieval is not initialized."]
        # Simple keyword match (can replace with advanced semantic similarity later)
        # Lower-case the query once instead of once per document.
        needle = query.lower()
        return [doc for doc in self.documents if needle in doc.lower()]