ThaiCodex / storePDF.py
microhum's picture
initial commit
220a370
raw
history blame contribute delete
971 Bytes
from langchain_community.document_loaders import PyPDFLoader
from langchain.docstore.document import Document
import os
from rag import Rag
# Default directory scanned for PDF files; used as the default argument of
# get_documents_from_path(). It is a relative path, so it is resolved against
# the current working directory of the process.
pdf_folder_path = 'files'
def get_documents_from_path(pdf_folder_path: str = pdf_folder_path) -> list:
    """Load every PDF in a folder and return its contents as ``Document`` objects.

    Each document returned by ``PyPDFLoader.load()`` is re-wrapped in a new
    ``Document`` that keeps the page text but replaces the loader's metadata
    with a single ``"source"`` key holding the PDF's file name without its
    extension.

    Args:
        pdf_folder_path: Directory to scan for PDF files. Only direct entries
            of the directory are considered (no recursion). Defaults to the
            module-level ``pdf_folder_path``.

    Returns:
        A list of ``Document`` objects, grouped by file and ordered by sorted
        file name. Empty if the directory contains no PDF files.

    Raises:
        OSError: Propagated from ``os.listdir`` (e.g. ``FileNotFoundError``
            when the directory does not exist).
    """
    documents = []
    # os.listdir() returns entries in arbitrary order; sort so the resulting
    # document order is reproducible across runs and platforms.
    for pdf_file in sorted(os.listdir(pdf_folder_path)):
        # Compare the extension case-insensitively so files such as
        # "REPORT.PDF" are not silently skipped.
        if not pdf_file.lower().endswith('.pdf'):
            continue
        loader = PyPDFLoader(os.path.join(pdf_folder_path, pdf_file))
        file_name_without_extension = os.path.splitext(pdf_file)[0]
        documents.extend(
            Document(
                page_content=doc.page_content,
                metadata={"source": file_name_without_extension},
            )
            for doc in loader.load()
        )
    return documents
if __name__ == "__main__":
    # Script entry point: build the Rag helper, load the PDFs from the default
    # folder and hand the resulting documents to the vector store.
    try:
        rag_llm = Rag()
        documents = get_documents_from_path()
        rag_llm.storeDocumentsInVectorstore(documents)
    except Exception as e:
        # Top-level boundary: report the failure on stderr and exit with a
        # non-zero status so callers (shells, CI, schedulers) can detect it.
        # Previously only the message was printed to stdout and the process
        # still exited with status 0. The exception type is included because
        # many exceptions have an empty or ambiguous message.
        raise SystemExit(f"Store PDFS failed: {type(e).__name__}: {e}")
    else:
        print("Store PDFS Completed")