Spaces:
Sleeping
Sleeping
from aimakerspace.text_utils import TextFileLoader, CharacterTextSplitter | |
from aimakerspace.vectordatabase import VectorDatabase | |
from aimakerspace.openai_utils.prompts import ( | |
UserRolePrompt, | |
SystemRolePrompt, | |
AssistantRolePrompt, | |
) | |
from aimakerspace.openai_utils.chatmodel import ChatOpenAI | |
import asyncio | |
import nest_asyncio | |
nest_asyncio.apply() | |
# Plain-text files that are loaded, split and indexed by the helpers below.
TEXT_DOCUMENTS = ["data/KingLear.txt"]

# System-role template. ``{context}`` is filled in with the retrieved passages
# and the model is told to answer only from them.
RAQA_PROMPT_TEMPLATE = (
    "\n"
    "Use the provided context to answer the user's query.\n"
    "You may not answer the user's query unless there is specific context in the following text.\n"
    "If you do not know the answer, or cannot answer, please respond with \"I don't know\".\n"
    "Context:\n"
    "{context}\n"
)
raqa_prompt = SystemRolePrompt(RAQA_PROMPT_TEMPLATE)

# User-role template. ``{user_query}`` is filled in with the question as asked.
USER_PROMPT_TEMPLATE = (
    "\n"
    "User Query:\n"
    "{user_query}\n"
)
user_prompt = UserRolePrompt(USER_PROMPT_TEMPLATE)
class RetrievalAugmentedQAPipeline:
    """Answer a query with a chat model using passages fetched from a vector store.

    For each query the pipeline retrieves the closest passages, places them in
    the system prompt (``raqa_prompt``), wraps the query in the user prompt
    (``user_prompt``) and hands both messages to the chat model.
    """

    def __init__(self, llm: ChatOpenAI, vector_db_retriever: VectorDatabase) -> None:
        """Keep references to the collaborators used by ``run_pipeline``.

        Args:
            llm: Chat-model wrapper; its ``run(client, messages)`` method
                produces the answer.
            vector_db_retriever: Store queried through ``search_by_text``.
        """
        # The ``llm`` annotation is the type, not ``ChatOpenAI()``: annotations
        # are evaluated when the ``def`` executes, so a call there would build
        # a throw-away model every time this class is defined.
        self.llm = llm
        self.vector_db_retriever = vector_db_retriever

    def run_pipeline(self, client, user_query: str) -> str:
        """Retrieve context for ``user_query`` and ask the chat model to answer.

        Args:
            client: Passed through unchanged as the first argument of
                ``self.llm.run``.
            user_query: The question to answer; also used as the search text.

        Returns:
            Whatever ``self.llm.run`` returns for the two prompt messages.
        """
        context_list = self.vector_db_retriever.search_by_text(user_query, k=4)

        # Element 0 of every hit is used as the passage text (presumably hits
        # are (text, score) pairs -- confirm against VectorDatabase). Each
        # passage is terminated by a newline.
        context_prompt = "".join(context[0] + "\n" for context in context_list)

        formatted_system_prompt = raqa_prompt.create_message(context=context_prompt)
        formatted_user_prompt = user_prompt.create_message(user_query=user_query)

        return self.llm.run(client, [formatted_system_prompt, formatted_user_prompt])
def _split_documents():
    """Load every file listed in ``TEXT_DOCUMENTS`` and return its pieces.

    Returns:
        One flat list holding the output of ``CharacterTextSplitter.split_texts``
        for each file, in the order the files are listed.
    """
    chunks = []
    for path in TEXT_DOCUMENTS:
        # Read the file first, then cut what was loaded with a fresh splitter
        # using its default settings.
        loaded_texts = TextFileLoader(path).load_documents()
        chunks += CharacterTextSplitter().split_texts(loaded_texts)
    return chunks
def _build_vector_db():
    """Create a ``VectorDatabase`` and populate it from the split documents.

    Returns:
        The result of awaiting ``abuild_from_list`` on the new database
        (presumably the populated database itself -- confirm against
        ``VectorDatabase``).
    """
    empty_db = VectorDatabase()
    chunks = _split_documents()
    # The build step is a coroutine, so it is driven to completion here.
    # NOTE(review): ``nest_asyncio.apply()`` at module import presumably lets
    # this work when an event loop is already running -- confirm.
    return asyncio.run(empty_db.abuild_from_list(chunks))
# def retrieval_augmented_qa_pipeline(client): | |
# vector_db = _build_vector_db() | |
# pipeline = RetrievalAugmentedQAPipeline( | |
# llm=client, | |
# vector_db_retriever=vector_db) | |
# return pipeline | |