# f150 / src / index.py
# Author: Adrian Cowham
# initial commit (cbdf795)
#!/usr/bin/env python
import json
import logging
import os
import sys
import psycopg2
import s3fs
import torch
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index import (ServiceContext, SimpleDirectoryReader, StorageContext,
SummaryIndex, get_response_synthesizer,
set_global_service_context)
from llama_index.indices.document_summary import DocumentSummaryIndex
from llama_index.indices.vector_store import VectorStoreIndex
from llama_index.llms import OpenAI
from llama_index.schema import IndexNode
from llama_index.vector_stores import PGVectorStore
from sqlalchemy import make_url
# logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
# logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))
def get_embed_model():
    """Load the gte-small sentence-embedding model on the best available device.

    Device preference: MPS (Apple silicon) > CUDA > CPU — the MPS check runs
    last, so it intentionally overrides CUDA when both report available.
    Exits the process with a non-zero status if the model cannot be loaded,
    since nothing downstream can run without embeddings.

    Returns:
        HuggingFaceEmbeddings: wrapper around ``thenlper/gte-small``.
    """
    model_kwargs = {'device': 'cpu'}
    if torch.cuda.is_available():
        model_kwargs['device'] = 'cuda'
    if torch.backends.mps.is_available():
        model_kwargs['device'] = 'mps'
    # normalize_embeddings=True makes cosine similarity a plain dot product
    encode_kwargs = {'normalize_embeddings': True}
    print("Loading model...")
    try:
        model_norm = HuggingFaceEmbeddings(
            model_name="thenlper/gte-small",
            model_kwargs=model_kwargs,
            encode_kwargs=encode_kwargs,
        )
    except Exception as exception:
        # Fatal: report to stderr and exit non-zero (original used bare exit()
        # with a misleading "Loading fake model" message and status 0).
        print(f"Failed to load embedding model: {exception}", file=sys.stderr)
        sys.exit(1)
    print("Model loaded.")
    return model_norm
def create_table(db_name, connection_string):
    """Drop and recreate the database *db_name* from scratch.

    Args:
        db_name: name of the database to (re)create.
        connection_string: DSN of a maintenance database (e.g. ``postgres``)
            whose user may issue CREATE DATABASE.

    Note: DROP/CREATE DATABASE cannot run inside a transaction block, hence
    autocommit. The database name is quoted with ``psycopg2.sql.Identifier``
    instead of raw f-string interpolation to avoid SQL injection.
    """
    from psycopg2 import sql  # identifier quoting for the database name
    conn = psycopg2.connect(connection_string)
    try:
        conn.autocommit = True  # required for CREATE/DROP DATABASE
        with conn.cursor() as cursor:
            ident = sql.Identifier(db_name)
            cursor.execute(sql.SQL("DROP DATABASE IF EXISTS {}").format(ident))
            cursor.execute(sql.SQL("CREATE DATABASE {}").format(ident))
    finally:
        conn.close()  # original leaked the connection
def create_vector_store():
    """Recreate the ``helm`` database and return a PGVectorStore over it.

    The maintenance-database DSN may be overridden with the
    ``PG_CONNECTION_STRING`` environment variable; the previous hard-coded
    local DSN remains the default, so existing behavior is unchanged.

    Returns:
        PGVectorStore: hybrid-search store over table ``f150_manual``.
    """
    db_name = "helm"
    connection_string = os.environ.get(
        "PG_CONNECTION_STRING",
        "postgresql://adrian@localhost:5432/postgres",
    )
    create_table(db_name, connection_string)
    url = make_url(connection_string)
    vector_store = PGVectorStore.from_params(
        database=db_name,
        host=url.host,
        password=url.password,
        port=url.port,
        user=url.username,
        table_name="f150_manual",
        embed_dim=384,  # output dimension of thenlper/gte-small
        hybrid_search=True,
        text_search_config="english",
    )
    return vector_store
def get_remote_filesystem():
AWS_KEY = "AKIAWCUHDQXX3H7PPRXN"
AWS_SECRET = "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"
s3 = s3fs.S3FileSystem(
key=AWS_KEY,
secret=AWS_SECRET,
)
return s3
def create_vector_index():
    """Build a pgvector-backed index over every document in docs/chapters.

    Loads all chapter files, recreates the backing Postgres vector store, and
    indexes the documents into it with progress reporting.
    """
    documents = SimpleDirectoryReader(input_dir="docs/chapters").load_data()
    store = create_vector_store()
    context = StorageContext.from_defaults(vector_store=store)
    return VectorStoreIndex.from_documents(
        documents,
        storage_context=context,
        embedding_model=None,
        show_progress=True,
        chunk_size=1024,
        chunk_overlap=20,
    )
def create_recursive_index():
    """Build per-chapter indexes plus a top-level recursive summary index.

    For each file in ``docs/chapters``:
      * loads it as one document keyed by its title (file stem),
      * builds and persists a per-chapter vector index to S3,
      * builds a DocumentSummaryIndex and wraps its summary in an IndexNode.
    Finally indexes all summary IndexNodes into a single vector index and
    persists it to S3, enabling recursive retrieval (summary -> chapter).
    """
    doc_dir = "./docs/chapters/"
    doc_summaries = {}
    titles = []
    for filename in os.listdir(doc_dir):
        print(filename)
        title = filename.split(".")[0]
        titles.append(title)
        # BUG FIX: the original interpolated a literal placeholder instead of
        # the loop's file name, so every chapter pointed at a nonexistent path.
        docs = SimpleDirectoryReader(input_files=[f"{doc_dir}{filename}"]).load_data()
        docs[0].doc_id = title
        doc_summaries[title] = docs
    context_window = 4096
    embed_model = get_embed_model()
    chatgpt = OpenAI(temperature=0, model="gpt-3.5-turbo-16k")
    service_context = ServiceContext.from_defaults(
        llm=chatgpt,
        embed_model=embed_model,
        chunk_size=1024,
        context_window=context_window)
    s3 = get_remote_filesystem()
    nodes = []
    for title in titles:
        print(title)
        # Per-chapter vector index, persisted under its own S3 prefix.
        storage_context = StorageContext.from_defaults()
        vector_index = VectorStoreIndex.from_documents(
            doc_summaries[title],
            service_context=service_context,
            verbose=True,
            storage_context=storage_context,
            show_progress=True,
        )
        vector_index.storage_context.persist(
            f"f150-user-manual/recursive-agent/{title}/vector_index", fs=s3)
        # Per-chapter summary index; its summary becomes a routing node.
        response_synthesizer = get_response_synthesizer(
            response_mode="compact_accumulate", use_async=False
        )
        storage_context = StorageContext.from_defaults()
        summary_index = DocumentSummaryIndex.from_documents(
            doc_summaries[title],
            service_context=service_context,
            response_synthesizer=response_synthesizer,
            verbose=True,
            storage_context=storage_context,
            show_progress=True,
        )
        print(summary_index.get_document_summary(title))
        node = IndexNode(text=summary_index.get_document_summary(title), index_id=title)
        nodes.append(node)
    # Top-level index over the chapter summaries; index_id links back to the
    # per-chapter indexes for recursive retrieval.
    storage_context = StorageContext.from_defaults()
    vector_index = VectorStoreIndex(
        nodes,
        service_context=service_context,
        verbose=True,
        storage_context=storage_context,
        show_progress=True,
    )
    vector_index.storage_context.persist(
        "f150-user-manual/recursive-agent/vector_index", fs=s3)
def main():
    """Entry point: set the global embedding context, then build both indexes."""
    embed_model = get_embed_model()
    service_context = ServiceContext.from_defaults(embed_model=embed_model)
    set_global_service_context(service_context)
    # Trailing semicolons removed; calls are plain statements.
    create_vector_index()
    create_recursive_index()


if __name__ == "__main__":
    main()