try:
    from sklearn.datasets import fetch_20newsgroups
except ImportError:
    # scikit-learn is only needed by DocumentRetriever.load_documents().
    # Keeping the module importable without it lets callers that populate
    # `documents` themselves still use retrieve().
    fetch_20newsgroups = None


class DocumentRetriever:
    """Case-insensitive substring search over an in-memory list of documents.

    The ``documents`` attribute holds the searchable texts. It starts empty
    and is filled by :meth:`load_documents` (or assigned directly).
    """

    def __init__(self):
        # Plain list of document strings; empty until documents are loaded.
        self.documents = []

    def load_documents(self, subset_size=500):
        """Load a subset of the 20 Newsgroups dataset into ``self.documents``.

        Args:
            subset_size: Number of leading documents to keep. ``None`` keeps
                every document returned by the dataset loader.

        Raises:
            ValueError: If ``subset_size`` is negative. A negative value
                would otherwise be interpreted by slicing as "all but the
                last N documents", which is never what a size means.
            ImportError: If scikit-learn is not installed.
        """
        # Validate before touching the dataset so bad input fails fast.
        if subset_size is not None and subset_size < 0:
            raise ValueError(
                f"subset_size must be non-negative, got {subset_size}"
            )
        if fetch_20newsgroups is None:
            raise ImportError(
                "scikit-learn is required to load the 20 Newsgroups dataset."
            )

        # NOTE: the loader may fetch the dataset on first use; see the
        # scikit-learn documentation for caching behaviour.
        newsgroups_data = fetch_20newsgroups(subset='all')
        # Keep only the first `subset_size` documents.
        self.documents = newsgroups_data.data[:subset_size]
        print(f"Loaded {len(self.documents)} documents.")

    def retrieve(self, query: str) -> list:
        """Return every loaded document that contains ``query``.

        Matching is a case-insensitive substring test (both sides are
        lower-cased). An empty ``query`` therefore matches every document.

        Args:
            query: Text to search for.

        Returns:
            The matching documents, in their stored order. If no documents
            have been loaded, a one-element list holding the message
            ``"Document retrieval is not initialized."`` is returned instead;
            callers should not treat that entry as a real document.
        """
        if not self.documents:
            return ["Document retrieval is not initialized."]

        # Lower-case the query once instead of once per document.
        needle = query.lower()
        return [doc for doc in self.documents if needle in doc.lower()]