Spaces:
Running
Running
from sklearn.datasets import fetch_20newsgroups | |
class DocumentRetriever:
    """In-memory keyword retriever over a subset of the 20 Newsgroups corpus.

    Documents are stored as plain strings in ``self.documents``. Lookup is a
    case-insensitive substring match; there is no ranking or indexing.
    """

    def __init__(self):
        # Stays empty until load_documents() runs; retrieve() treats an
        # empty list as "not initialized".
        self.documents = []

    def load_documents(self, subset_size: int = 500) -> None:
        """Load a subset of the 20 Newsgroups dataset.

        Fetches the corpus with ``fetch_20newsgroups(subset='all')`` and keeps
        the first ``subset_size`` entries of its ``data`` list, replacing any
        previously loaded documents. Prints how many documents were kept.

        Args:
            subset_size: Upper bound on the number of documents to keep.
                Applied as a plain slice, so fewer documents are kept if the
                corpus is smaller.
        """
        # NOTE(review): per the scikit-learn docs this call may download the
        # dataset on first use -- confirm the runtime has network/cache access.
        newsgroups_data = fetch_20newsgroups(subset='all')
        self.documents = newsgroups_data.data[:subset_size]
        print(f"Loaded {len(self.documents)} documents.")

    def retrieve(self, query: str) -> list:
        """Retrieve documents related to the query.

        Args:
            query: Text to look for. Matching is a case-insensitive substring
                test, so an empty query matches every loaded document.

        Returns:
            The loaded documents that contain ``query``, in their stored
            order. If no documents are loaded, a single-element list holding
            the message ``"Document retrieval is not initialized."`` is
            returned instead.
        """
        if not self.documents:
            return ["Document retrieval is not initialized."]
        # Lower-case the query once rather than once per document.
        needle = query.lower()
        return [doc for doc in self.documents if needle in doc.lower()]