Spaces:
Sleeping
Sleeping
File size: 1,889 Bytes
d1d1d6a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 |
import fitz
import os
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone
from dotenv import load_dotenv
import os
# Load environment variables via python-dotenv before the API key is read below.
load_dotenv()
# Pinecone client configured from the PINECONE_KEY environment variable
# (os.getenv yields None when the variable is not set).
pc = Pinecone(api_key=os.getenv('PINECONE_KEY'))
# Name of the Pinecone index that page vectors are upserted into.
index_name = "askmeaboutrag"
# Handle to that index; store_document_in_pinecone uses it as a module global.
index = pc.Index(index_name)
# Sentence-embedding model passed to store_document_in_pinecone to encode page text.
# NOTE(review): the index dimension presumably has to match this model's
# embedding size — confirm against the index configuration.
model = SentenceTransformer('all-MiniLM-L6-v2')
def extract_pages_from_pdf(pdf_path):
    """Return the plain text of every page of a PDF, in page order.

    Args:
        pdf_path: Path of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        A list with one string per page (index 0 is the first page), each
        obtained from ``page.get_text("text")``.
    """
    # Opening the document as a context manager closes it (and releases the
    # underlying file handle) even when text extraction raises; previously the
    # document was never closed.
    with fitz.open(pdf_path) as doc:
        return [page.get_text("text") for page in doc]
def store_document_in_pinecone(document_id, pages, title, model, *, batch_size=32):
    """Embed each page of a document and upsert the vectors into Pinecone.

    Vectors are written to the module-level ``index``. Each vector gets the id
    ``"{document_id}_page_{page_number}"`` (page numbers are zero-based) and
    carries the document id, page number, page text and title as metadata.

    Args:
        document_id: Identifier used as the prefix of every vector id.
        pages: Sized sequence of page texts, in page order.
        title: Document title stored in each vector's metadata.
        model: Object whose ``encode(text)`` returns the embedding of a page.
        batch_size: Maximum number of vectors sent per upsert request. The
            default is deliberately small because every record also carries
            the full page text in its metadata.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    pending = []
    for page_number, page_text in enumerate(pages):
        embedding = model.encode(page_text)
        # Array-like results (e.g. NumPy arrays) are converted to a plain
        # list of floats so the payload does not depend on the client's
        # implicit conversion.
        if hasattr(embedding, "tolist"):
            embedding = embedding.tolist()
        pending.append(
            {
                "id": f'{document_id}_page_{page_number}',
                "values": embedding,
                "metadata": {
                    "document_id": document_id,
                    "page_number": page_number,
                    "text": page_text,
                    "title": title,
                },
            }
        )
        # Send full batches as they fill up instead of one request per page.
        if len(pending) >= batch_size:
            index.upsert(vectors=pending)
            pending = []

    # Flush whatever is left over from the last, partially filled batch.
    if pending:
        index.upsert(vectors=pending)

    print(f"Stored {len(pages)} pages for document: {document_id}")
def process_pdfs_in_folder(folder_path):
    """Extract and store every PDF found directly inside ``folder_path``.

    Each PDF is split into pages with ``extract_pages_from_pdf`` and stored
    with ``store_document_in_pinecone`` using the module-level ``model``. The
    document id is the 1-based position of the file among *all* entries of
    the folder (non-PDF entries still consume a position), and the title is
    the file name without its extension.

    Args:
        folder_path: Directory to scan; sub-directories are not descended into.
    """
    # os.listdir() returns entries in arbitrary order, which made the
    # position-based document_id differ between runs and machines. Sorting
    # keeps the id of each file stable as long as the folder content is the
    # same, so re-running does not overwrite another document's vectors.
    entries = sorted(os.listdir(folder_path))
    for position, filename in enumerate(entries, start=1):
        # Case-insensitive match so files such as "REPORT.PDF" are not
        # silently skipped.
        if not filename.lower().endswith('.pdf'):
            continue
        pdf_path = os.path.join(folder_path, filename)
        document_id = str(position)
        print(f"Processing {filename} with document_id: {document_id}")
        pages = extract_pages_from_pdf(pdf_path)
        title = os.path.splitext(filename)[0]
        store_document_in_pinecone(document_id, pages, title, model)
    print("Stored Completed")
# Script entry: process every PDF in the relative 'files' directory.
# NOTE(review): these statements run whenever the module is loaded, not only
# when it is executed directly — consider an `if __name__ == "__main__":`
# guard if this module is ever imported from elsewhere.
folder_path = 'files'
process_pdfs_in_folder(folder_path)
|