class Chroma:
    def __init__(self) -> None:
        """
        Creates a persistent chromadb client if none exists and stores the
        client session and collection on the instance.
        """
        # Lazy imports: chromadb and sentence-transformers are heavy dependencies.
        import chromadb
        from chromadb.utils import embedding_functions
        import os

        self.DB_PATH = "./chromadb_linux/"
        self.MODEL_NAME: str = "mixedbread-ai/mxbai-embed-large-v1"  # ~0.5 GB
        self.COLLECTION_NAME: str = "scheme"
        self.EMBEDDING_FUNC = embedding_functions.SentenceTransformerEmbeddingFunction(
            model_name=self.MODEL_NAME
        )

        if os.path.exists(self.DB_PATH):
            # Reuse the existing on-disk database and collection.
            self.client = chromadb.PersistentClient(path=self.DB_PATH)
            self.schemer = self.client.get_collection(
                name=self.COLLECTION_NAME,
                embedding_function=self.EMBEDDING_FUNC,
            )
        else:
            print("Collection missing, creating new collection")
            self.client = chromadb.PersistentClient(path=self.DB_PATH)
            self.schemer = self.client.create_collection(
                name=self.COLLECTION_NAME,
                embedding_function=self.EMBEDDING_FUNC,
            )

    def get_collection(self):
        return self.schemer

    def add_materials(self, file_path: str) -> None:
        """
        Reads the PDF at file_path, splits its text into fixed-size batches,
        and upserts them into the embedded collection.
        """
        from pypdf import PdfReader

        doc = PdfReader(file_path)
        text_content: str = ""
        for page in doc.pages:
            text_content += page.extract_text()
        text_content = text_content.replace("\n", " ")

        batch_size = 1024
        padding_element = "."
        batch_documents = []
        batch_ids = []
        batch_metadata = []
        for i in range(0, len(text_content), batch_size):
            batch = text_content[i : min(i + batch_size, len(text_content))]
            if len(batch) < batch_size:
                # Pad the final batch so every document has the same length.
                padding_needed = batch_size - len(batch)
                batch = batch + padding_element * padding_needed
            print(f"Batch {i}/{len(text_content)}")
            batch_documents.append(batch)
            batch_ids.append(f"batch{i}{batch[0]}")
            batch_metadata.append({"length": len(batch)})

        print("Upserting into collection")
        self.schemer.upsert(
            ids=batch_ids,
            metadatas=batch_metadata,
            documents=batch_documents,
        )

    @staticmethod
    def encode_image(image) -> str:
        """
        Encodes a PIL image as a base64 JPEG string, suitable for passing to
        image_to_text.
        """
        import io
        import base64

        byte_arr = io.BytesIO()
        image.save(byte_arr, format="JPEG")
        encoded_image = base64.b64encode(byte_arr.getvalue()).decode("utf-8")
        return encoded_image

    async def image_to_text(self, image) -> object:
        """
        Transcribes a base64-encoded image with the OpenAI vision API and
        returns the parsed JSON object.
        """
        from openai import AsyncOpenAI
        import json

        client = AsyncOpenAI()
        response = await client.chat.completions.create(
            model="gpt-4-turbo",
            response_format={"type": "json_object"},
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": (
                                "Transcribe the contents of this image and return a "
                                "JSON object that contains the text. It must be "
                                "structured in the following manner: two entries with "
                                "the following keys: 'content' and 'text'. Content "
                                "will be a line describing what the content of text "
                                "will be, and text will be a simple transcription of "
                                "the image."
                            ),
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{image}",
                                "detail": "high",
                            },
                        },
                    ],
                }
            ],
        )
        return json.loads(response.choices[0].message.content)


if __name__ == "__main__":
    c = Chroma()
    c.add_materials("data/Essentials of Programming Languages 2001.pdf")
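
# Usage sketch (not part of the module): once a PDF has been ingested with
# add_materials, the underlying chromadb collection can be queried for relevant
# chunks. The query text below is a made-up placeholder; query_texts and
# n_results are standard chromadb Collection.query parameters, and the attached
# SentenceTransformer embedding function embeds the query automatically.
#
#   c = Chroma()
#   c.add_materials("data/Essentials of Programming Languages 2001.pdf")
#   results = c.get_collection().query(
#       query_texts=["What is lexical scoping?"],
#       n_results=3,
#   )
#   print(results["documents"][0])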