chagu-demo / rag_sec /document_retriver.py
talexm
news-data retrieval
ed26242
raw
history blame
814 Bytes
import faiss
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.datasets import fetch_20newsgroups
class DocumentRetriever:
    """Keyword-based retriever over an in-memory list of text documents.

    Documents are held in ``self.documents`` as plain strings. Call
    :meth:`load_documents` to populate the list before calling
    :meth:`retrieve`.
    """

    def __init__(self):
        """Create a retriever with no documents loaded."""
        self.documents = []

    def load_documents(self):
        """Load the 20 Newsgroups dataset into ``self.documents``.

        Uses ``fetch_20newsgroups(subset='all')`` and stores its ``data``
        attribute. Prints a warning if the resulting collection is empty.
        """
        newsgroups_data = fetch_20newsgroups(subset='all')
        self.documents = newsgroups_data.data
        if not self.documents:
            print("No documents loaded!")

    def retrieve(self, query: str) -> list:
        """Return every loaded document that contains ``query``.

        Matching is a case-insensitive substring test.

        Args:
            query: Text to search for.

        Returns:
            A list of matching documents in their original order. If no
            documents have been loaded, a single-element list holding the
            message ``"Document retrieval is not initialized."`` is returned
            instead. An empty, whitespace-only, or ``None`` query yields an
            empty list.
        """
        if not self.documents:
            return ["Document retrieval is not initialized."]

        # An empty string is a substring of every string, so a blank query
        # would otherwise return the entire corpus.
        if not query or not query.strip():
            return []

        # Simple keyword match (can be replaced with semantic similarity
        # later). Lower-case the query once rather than once per document.
        needle = query.lower()
        return [doc for doc in self.documents if needle in doc.lower()]