"""Embed a game document and upload the resulting vectors to Pinecone.

The script reads its configuration from a ``.env`` file, splits a Word
document into text chunks, embeds each chunk through the OpenAI embedding
endpoint and upserts the vectors into a Pinecone index.
"""
import hashlib
import json
import os
from time import sleep

import dotenv
import openai
import pinecone
from langchain.document_loaders import Docx2txtLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

from helper import append_file

## Read the environment variables
dotenv.load_dotenv('.env')
openai.api_key = os.getenv('OPENAI_API_KEY')
embedding_model = os.getenv('EMBEDDING_ENGINE')
debug_mode = os.getenv('DEBUG')
file_path = os.getenv('GAME_DOCS_FOLDER')
file_name = os.getenv('GAME_DOCS_FILE')
game_index = os.getenv('GAME_ID_INDEX')
pinecone_api_key = os.getenv('PINECONE_API_KEY')
pinecone_env = os.getenv('PINECONE_REGION')
pinecone_index = os.getenv('PINECONE_INDEX')

# Upper bound on the number of vectors sent in a single upsert request, so
# that a large document is uploaded as several small requests instead of one
# oversized one.
UPSERT_BATCH_SIZE = 100

pinecone.init(
    api_key=pinecone_api_key,
    environment=pinecone_env
)

# Check whether the configured index already exists (only create it if not).
if pinecone_index not in pinecone.list_indexes():
    # NOTE(review): dimension=1536 presumably matches the output size of the
    # model named by EMBEDDING_ENGINE -- confirm when changing the model.
    pinecone.create_index(pinecone_index, dimension=1536, metric="cosine",
                          pods=1, pod_type="p1.x1")
    # Short pause after requesting index creation before connecting to it.
    sleep(3)

vector_db = pinecone.Index(pinecone_index)


def perform_embedding(doclist):
    """Embed every document chunk and build the Pinecone upsert payload.

    A 12-character game ID is derived from the MD5 digest of ``file_name``
    and recorded, together with the file name, as a JSON line appended to
    the game index file via ``append_file``.

    Args:
        doclist: Iterable of chunk objects exposing a ``page_content``
            string attribute.

    Returns:
        A list of ``(unique_id, vector, metadata)`` tuples, one per chunk,
        where ``unique_id`` is ``"<game_id>-<chunk position>"``.
    """
    # Convert file_name to a short, stable ID.
    game_id = hashlib.md5(file_name.encode('utf-8')).hexdigest()[:12]
    json_val = {"game_id": game_id, "game_file": file_name}
    append_file(f"{file_path}/{game_index}", json.dumps(json_val))

    payload = []
    for i, doc in enumerate(doclist):
        unique_id = f"{game_id}-{i}"
        # Drop every non-ASCII character before embedding and storing.
        content = doc.page_content.encode(
            encoding='ASCII', errors='ignore').decode()
        response = openai.Embedding.create(model=embedding_model,
                                           input=content)
        metadata = {'game_id': game_id, 'split_count': i, 'text': content}
        vector = response['data'][0]['embedding']
        payload.append((unique_id, vector, metadata))
    return payload


def load_split_document():
    """Load the configured Word document and split it into chunks.

    The document at ``<file_path>/<file_name>`` is loaded with
    ``Docx2txtLoader`` and split with a ``RecursiveCharacterTextSplitter``
    (chunk size 1000, no overlap). When the ``DEBUG`` environment variable
    equals ``'True'`` the number of chunks is printed.

    Returns:
        The list of split documents produced by the text splitter.
    """
    loader = Docx2txtLoader(f"{file_path}/{file_name}")
    word_doc_data = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000,
                                                   chunk_overlap=0)
    docs = text_splitter.split_documents(word_doc_data)
    if debug_mode == 'True':
        print(f"Total count of splits created: {len(docs)}")
    return docs


def upload_game_docs(batch_size=UPSERT_BATCH_SIZE):
    """Split, embed and upload the configured game document to Pinecone.

    Vectors are upserted in slices of at most ``batch_size`` items so that
    no single request grows with the size of the document. When the
    document produces no chunks, no upsert request is sent.

    Args:
        batch_size: Maximum number of vectors per upsert request.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    docs = load_split_document()
    payload = perform_embedding(docs)
    for start in range(0, len(payload), batch_size):
        vector_db.upsert(payload[start:start + batch_size])


if __name__ == '__main__':
    upload_game_docs()