import os
import sys

from langchain.docstore.document import Document
from langchain_community.document_loaders import PyPDFLoader

from rag import Rag


# Default folder scanned for PDFs; bound as the default argument of
# get_documents_from_path() at definition time.
pdf_folder_path = 'files'


def get_documents_from_path(pdf_folder_path: str = pdf_folder_path) -> list:
    """Load every PDF directly inside a folder and return its contents.

    The folder is scanned non-recursively. Entries are processed in sorted
    name order so the result is reproducible across runs and platforms, and
    the ``.pdf`` extension is matched case-insensitively.

    Args:
        pdf_folder_path: Directory to scan. Defaults to the module-level
            ``pdf_folder_path`` value.

    Returns:
        A list of ``Document`` objects, one for each document returned by
        ``PyPDFLoader.load()``. Each carries the loaded ``page_content`` and
        a metadata dict holding only ``"source"``: the PDF's file name
        without its extension. The list is empty when the folder contains
        no PDF files.

    Raises:
        OSError: Propagated from ``os.listdir`` when the folder cannot be
            listed (for example, it does not exist).
    """
    documents = []

    # os.listdir() gives names in arbitrary order; sort for determinism.
    for pdf_file in sorted(os.listdir(pdf_folder_path)):
        # Case-insensitive so that e.g. "REPORT.PDF" is not silently skipped.
        if not pdf_file.lower().endswith('.pdf'):
            continue

        loader = PyPDFLoader(os.path.join(pdf_folder_path, pdf_file))
        file_name_without_extension = os.path.splitext(pdf_file)[0]

        # Rebuild each document so its metadata is reduced to the bare file
        # name; whatever metadata the loader attached is intentionally
        # dropped.
        documents.extend(
            Document(
                page_content=doc.page_content,
                metadata={"source": file_name_without_extension},
            )
            for doc in loader.load()
        )

    return documents


if __name__ == "__main__":
    # Script entry point: load every PDF from the default folder and store
    # the resulting documents in the Rag vector store.
    try:
        rag_llm = Rag()
        documents = get_documents_from_path()
        rag_llm.storeDocumentsInVectorstore(documents)
    except Exception as e:
        # Top-level boundary: report the failure on stderr and exit non-zero
        # so a shell script or CI job can tell a failed ingestion from a
        # successful one (printing to stdout and exiting 0 hid failures).
        print(f"Store PDFS failed: {type(e).__name__}: {e}", file=sys.stderr)
        sys.exit(1)
    else:
        print("Store PDFS Completed")