import os
import pickle
import langchain
import faiss
from langchain import HuggingFaceHub, PromptTemplate
from langchain.chains import ConversationalRetrievalChain, LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.llms import OpenAI
from langchain.document_loaders import DirectoryLoader, TextLoader, UnstructuredHTMLLoader
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceHubEmbeddings
from langchain.memory import ConversationBufferWindowMemory
from langchain.prompts.chat import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    SystemMessagePromptTemplate,
    StringPromptTemplate,
)
from langchain.output_parsers import PydanticOutputParser
from langchain.tools.json.tool import JsonSpec
from typing import Any, Callable, List, Optional, Tuple, Union
from langchain.schema import AgentAction, AgentFinish
import re
from langchain.text_splitter import CharacterTextSplitter
from custom_faiss import MyFAISS
from langchain.cache import InMemoryCache
from langchain.chat_models import ChatGooglePalm
from langchain.document_loaders import JSONLoader
from langchain.agents import (initialize_agent, Tool, AgentType, AgentExecutor,
                              LLMSingleActionAgent, AgentOutputParser, BaseMultiActionAgent)
from langchain.tools import StructuredTool
from langchain.chains import create_tagging_chain
from pydantic import BaseModel, Field
class ToolArgsSchema(BaseModel):
    student_name: Optional[str] = Field(description="The name of the student")
    question: str = Field(description="The question being asked")
    question_type: str = Field(description="The type of question being asked")
    interest: Optional[str] = Field(description="The interest of the student")

    class Config:
        schema_extra = {
            "required": ["question", "question_type"]
        }
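

def _example_tool_args() -> ToolArgsSchema:
    # Hedged usage sketch (not called by the pipeline): the fields the grading
    # tool expects. "Alice" and the question text are invented placeholders.
    return ToolArgsSchema(
        student_name="Alice",
        question="What is Alice's total score?",
        question_type="grade-based",
    )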
langchain.llm_cache = InMemoryCache()
model_name = "GPT-4"
pickle_file = "_vs.pkl"
index_file = "_vs.index"
models_folder = "models/"
os.environ["LANGCHAIN_TRACING"] = "true"
discussions_file_path = "discussion_entries.json"
llm = OpenAI(model_name="gpt-3.5-turbo-16k", temperature=0, verbose=True)
embeddings = OpenAIEmbeddings(model='text-embedding-ada-002')
chat_history = []
memory = ConversationBufferWindowMemory(memory_key="chat_history", k=10)  # note: not currently wired into the chain below
vectorstore_index = None
agent_prompt = """
I am the LLM AI canvas discussion grading assistant.
I can answer two types of questions: grade-based questions and interest-based questions.
Grade-based questions are about the grades of a certain student or a group of students based on the rubric below for the Canvas discussion on the topic "8 nouns". ALWAYS return the total score when the question is grade-based.
Interest-based questions are about the interests or skills of a certain student or a group of students based on their discussion posts.
You have access to the following tools:
{tools}
Use the following format:
Question: the input question you must answer
Thought: you should always think about what type of question it is
Action: the action to take, should be one of [{tool_names}]
Action Input: the input to the action
Observation: the result of the action
... (this Thought/Action/Action Input/Observation can repeat N times)
Thought: I now know the final answer
Final Answer: the final answer to the original input question
Begin!
Question: {input}
{agent_scratchpad}
"""
# Set up a prompt template
class CustomPromptTemplate(StringPromptTemplate):
    # The template to use
    template: str
    ############## NEW ######################
    # The list of tools available
    tools_getter: Callable

    def format(self, **kwargs) -> str:
        # Get the intermediate steps (AgentAction, Observation tuples)
        # Format them in a particular way
        intermediate_steps = kwargs.pop("intermediate_steps")
        thoughts = ""
        for action, observation in intermediate_steps:
            thoughts += action.log
            thoughts += f"\nObservation: {observation}\nThought: "
        # Set the agent_scratchpad variable to that value
        kwargs["agent_scratchpad"] = thoughts
        ############## NEW ######################
        tools = self.tools_getter(kwargs["input"])
        # Create a tools variable from the list of tools provided
        kwargs["tools"] = "\n".join(
            [f"{tool.name}: {tool.description}" for tool in tools]
        )
        # Create a list of tool names for the tools provided
        kwargs["tool_names"] = ", ".join([tool.name for tool in tools])
        return self.template.format(**kwargs)
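

def _example_prompt_template() -> CustomPromptTemplate:
    # Hedged sketch: wires agent_prompt to a trivial tools_getter; a real
    # getter would pick tools based on the incoming question.
    return CustomPromptTemplate(
        template=agent_prompt,
        tools_getter=lambda query: [],
        input_variables=["input", "intermediate_steps"],
    )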
class CustomOutputParser(AgentOutputParser):
    def parse(self, llm_output: str) -> Union[AgentAction, AgentFinish]:
        print("llm_output")
        print(llm_output)
        # Check if agent should finish
        if "Final Answer:" in llm_output:
            return AgentFinish(
                # Return values is generally always a dictionary with a single `output` key
                # It is not recommended to try anything else at the moment :)
                return_values={"output": llm_output.split("Final Answer:")[-1].strip()},
                log=llm_output,
            )
        # Parse out the action and action input
        regex = r"Action\s*\d*\s*:(.*?)\nAction\s*\d*\s*Input\s*\d*\s*:[\s]*(.*)"
        match = re.search(regex, llm_output, re.DOTALL)
        if not match:
            raise ValueError(f"Could not parse LLM output: `{llm_output}`")
        action = match.group(1).strip()
        action_input = match.group(2)
        # Return the action and action input
        return AgentAction(tool=action, tool_input=action_input.strip(" ").strip('"'), log=llm_output)
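

def _example_parse() -> Union[AgentAction, AgentFinish]:
    # Hedged sketch: runs a fabricated ReAct-style completion through the
    # parser above. The "Grade" tool name mirrors the StructuredTool in the
    # commented-out agent code at the bottom of this file.
    sample = (
        "Thought: this is a grade-based question\n"
        "Action: Grade\n"
        "Action Input: What is Alice's total score?"
    )
    return CustomOutputParser().parse(sample)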
system_template = """
I am the LLM AI canvas discussion grading assistant.
I can answer two types of questions: grade-based questions and interest-based questions.
Grade-based questions are about the grades of a certain student or a group of students based on the rubric below for the Canvas discussion on the topic "8 nouns".
Interest-based questions are about the interests or skills of a certain student or a group of students based on their discussion posts.
To grade student discussions, I will follow the rubric below.
Student Post
3 points: Post includes 8 nouns and text describing how these nouns relate to the student.
2 points: Student's post includes 8 nouns but does not offer how those nouns relate to the student.
1 point: Student's post has significant missing details.
0 points: The student does not provide an initial post, or otherwise does not follow assignment instructions.
Response to Others
3 points: Student responds to at least 3 other student discussion threads AND responds to questions asked of them. Student posts insightful comments that prompt on-target discussion. These posts also avoid throw-away comments such as "I agree", "Me too", "Good idea".
2 points: Student was notably lacking in one criterion.
1 point: Student was notably lacking in two criteria.
0 points: The student does not interact in the threads of other students.
I will be able to identify each student by name, and I will be able to share their likings, interests, and other characteristics. I will also be able to filter out students based on their interests.
I will not deviate from the grading scheme. I will grade each discussion entry and reply carefully, and I will share the grades of all individuals by name on the basis of the rubric. I will ALWAYS return the total score when the question is grade-based.
The discussions and their replies are in the following format:
Student Post: Student Name
Reply to: Another Student Discussion ID
Your answer to grade-based questions should be in the following format:
Student Post: X points
Response to Others: X points
Total: X points
The following are the relevant discussions for grading or for answering interest-based questions
----------------
Discussions:
{context}"""
messages = [
    SystemMessagePromptTemplate.from_template(system_template),
    HumanMessagePromptTemplate.from_template("{question}"),
]
CHAT_PROMPT = ChatPromptTemplate.from_messages(messages)
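

def _example_chat_prompt():
    # Hedged example: CHAT_PROMPT expects exactly two variables, `context`
    # and `question`; the values here are placeholders, not real data.
    return CHAT_PROMPT.format_messages(
        context="<retrieved discussion chunks>",
        question="What is Alice's total score?",
    )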
def set_model_and_embeddings():
    global chat_history
    # set_model(model)
    # set_embeddings(model)
    chat_history = []
def set_embeddings(model):
    global embeddings
    if model == "GPT-3.5" or model == "GPT-4":
        print("Loading OpenAI embeddings")
        embeddings = OpenAIEmbeddings(model='text-embedding-ada-002')
    elif model == "Flan UL2" or model == "Flan T5":
        print("Loading Hugging Face embeddings")
        embeddings = HuggingFaceHubEmbeddings(repo_id="sentence-transformers/all-MiniLM-L6-v2")
def get_search_index():
    global vectorstore_index, model_name
    if os.path.isfile(get_file_path(model_name, pickle_file)) and os.path.isfile(
            get_file_path(model_name, index_file)) and os.path.getsize(get_file_path(model_name, pickle_file)) > 0:
        # Load index from pickle file
        with open(get_file_path(model_name, pickle_file), "rb") as f:
            # search_index = Chroma(persist_directory=models_folder, embedding_function=embeddings)
            search_index = pickle.load(f)
            print("Loaded index")
    else:
        search_index = create_index(model_name)
        print("Created index")
    vectorstore_index = search_index
    return search_index
def create_index(model):
    source_chunks = create_chunk_documents()
    search_index = search_index_from_docs(source_chunks)
    # search_index.persist()
    faiss.write_index(search_index.index, get_file_path(model, index_file))
    # Save index to pickle file
    with open(get_file_path(model, pickle_file), "wb") as f:
        pickle.dump(search_index, f)
    return search_index
def get_file_path(model, file):
    # If model is GPT-3.5 or GPT-4, use the OpenAI prefix; otherwise use the Hugging Face prefix
    if model == "GPT-3.5" or model == "GPT-4":
        return models_folder + "openai" + file
    else:
        return models_folder + "hf" + file
def search_index_from_docs(source_chunks):
    # print("source chunks: " + str(len(source_chunks)))
    # print("embeddings: " + str(embeddings))
    search_index = MyFAISS.from_documents(source_chunks, embeddings)
    return search_index
def get_html_files():
    loader = DirectoryLoader('docs', glob="**/*.html", loader_cls=UnstructuredHTMLLoader, recursive=True)
    document_list = loader.load()
    for document in document_list:
        # Use the file name (sans extension) as the student name in metadata
        document.metadata["name"] = document.metadata["source"].split("/")[-1].split(".")[0]
    return document_list
def metadata_func(record: dict, metadata: dict) -> dict:
    metadata["name"] = record.get("name")
    return metadata
def get_json_file():
    global discussions_file_path
    loader = JSONLoader(
        file_path=discussions_file_path,
        jq_schema='.[]', metadata_func=metadata_func, content_key="message")
    return loader.load()
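# Note (inferred from metadata_func and content_key above): this loader assumes
# discussion_entries.json is a JSON array of objects shaped roughly like
# {"name": "<student>", "message": "<post text>", ...}.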
def fetch_data_for_embeddings():
    # document_list = get_text_files()
    document_list = get_html_files()
    # document_list = get_json_file()
    print("document list: " + str(len(document_list)))
    return document_list
def get_text_files():
    loader = DirectoryLoader('docs', glob="**/*.txt", loader_cls=TextLoader, recursive=True)
    document_list = loader.load()
    return document_list
def create_chunk_documents():
    sources = fetch_data_for_embeddings()
    splitter = CharacterTextSplitter(separator=" ", chunk_size=800, chunk_overlap=0)
    source_chunks = splitter.split_documents(sources)
    print("chunks: " + str(len(source_chunks)))
    # Return the split chunks so that 800-character chunks, not whole documents, get embedded
    return source_chunks
def get_qa_chain(vectorstore_index, question, metadata):
    global llm, model_name
    print(llm)
    filter_dict = {"name": metadata.student_name}
    # embeddings_filter = EmbeddingsFilter(embeddings=embeddings, similarity_threshold=0.76)
    # compression_retriever = ContextualCompressionRetriever(base_compressor=embeddings_filter, base_retriever=gpt_3_5_index.as_retriever())
    retriever = get_retriever(filter_dict, vectorstore_index, metadata)
    print(retriever.get_relevant_documents(question))
    chain = ConversationalRetrievalChain.from_llm(llm, retriever, return_source_documents=True,
                                                  verbose=True, get_chat_history=get_chat_history,
                                                  combine_docs_chain_kwargs={"prompt": CHAT_PROMPT})
    return chain
def get_retriever(filter_dict, vectorstore_index, metadata):
    # Grade-based questions are filtered to the named student; interest-based
    # questions search across all students.
    if metadata.question_type == "grade-based":
        retriever = vectorstore_index.as_retriever(search_type='mmr',
                                                   search_kwargs={'lambda_mult': 1, 'fetch_k': 20, 'k': 10,
                                                                  'filter': filter_dict})
    else:
        retriever = vectorstore_index.as_retriever(search_type='mmr',
                                                   search_kwargs={'lambda_mult': 1, 'fetch_k': 20, 'k': 10})
    return retriever
def get_chat_history(inputs) -> str:
    res = []
    for human, ai in inputs:
        res.append(f"Human:{human}\nAI:{ai}")
    return "\n".join(res)
def generate_answer(question, metadata: ToolArgsSchema) -> str:
    # print("filter: " + filter)
    global chat_history, vectorstore_index
    chain = get_qa_chain(vectorstore_index, question, metadata)
    result = chain(
        {"question": question, "chat_history": chat_history})
    chat_history.extend([(question, result["answer"])])
    sources = []
    print(result)
    for document in result['source_documents']:
        source = document.metadata['source']
        sources.append(source.split('/')[-1].split('.')[0])
    print(sources)
    source = ',\n'.join(set(sources))
    # return result['answer'] + '\nSOURCES: ' + source
    return result['answer']
def get_question_type(question):
    parser = PydanticOutputParser(pydantic_object=ToolArgsSchema)
    prompt_template = """I can answer two types of questions: grade-based questions and interest-based questions.
Grade-based questions are about the grades of a certain student or a group of students based on the rubric below for the Canvas discussion on the topic "8 nouns".
Interest-based questions are about the interests or skills of a certain student or a group of students based on their discussion posts.
Question: {question}
Find the following information about the question asked. Leave an Optional field empty if the information is not available:
Format instructions: {format_instructions}"""
    llm = OpenAI(temperature=0)
    prompt = PromptTemplate(template=prompt_template, input_variables=["question"], output_parser=parser,
                            partial_variables={"format_instructions": parser.get_format_instructions()})
    llm_chain = LLMChain(
        llm=llm,
        prompt=prompt,
    )
    output = llm_chain.run(question)
    output = parser.parse(output)
    output = generate_answer(question, output)
    return output
# class FakeAgent(BaseMultiActionAgent):
#     """Fake Custom Agent."""
#
#     @property
#     def input_keys(self):
#         return ["input"]
#
#     def plan(
#         self, intermediate_steps: List[Tuple[AgentAction, str]], **kwargs: Any
#     ) -> Union[List[AgentAction], AgentFinish]:
#         print("input keys")
#         print(self.input_keys)
#         print("intermediate steps")
#         print(intermediate_steps)
#         print("kwargs")
#         print(kwargs)
#
#         """Given input, decide what to do.
#
#         Args:
#             intermediate_steps: Steps the LLM has taken to date,
#                 along with observations
#             **kwargs: User inputs.
#
#         Returns:
#             Action specifying what tool to use.
#         """
#         if len(intermediate_steps) == 0:
#             first_action = AgentAction(tool="question type", tool_input=kwargs["input"], log="")
#             print("first action")
#             print(first_action)
#             second_action = AgentAction(tool="Grade", tool_input=kwargs["input"], log="")
#             print("second action")
#             print(second_action)
#             return [
#                 first_action,
#                 second_action,
#             ]
#         else:
#             return AgentFinish(return_values={"output": "bar"}, log="")
#
#     async def aplan(
#         self, intermediate_steps: List[Tuple[AgentAction, str]], **kwargs: Any
#     ) -> Union[List[AgentAction], AgentFinish]:
#         """Given input, decide what to do.
#
#         Args:
#             intermediate_steps: Steps the LLM has taken to date,
#                 along with observations
#             **kwargs: User inputs.
#
#         Returns:
#             Action specifying what tool to use.
#         """
#         if len(intermediate_steps) == 0:
#             return [
#                 AgentAction(tool="question type", tool_input=kwargs["input"], log=""),
#                 AgentAction(tool="Grade",
#                             tool_input={
#                                 "student_name": kwargs["student_name"],
#                                 "question": kwargs["question"],
#                                 "question_type": kwargs["question_type"],
#                                 "interest": kwargs["interest"]
#                             }, log=""),
#             ]
#         else:
#             return AgentFinish(return_values={"output": "bar"}, log="")
#
#
# schema = {
#     "properties": {
#         "student_name": {"type": "string", "description": "The name of the student"},
#         "question": {"type": "string", "description": "The question being asked"},
#         "question type": {"type": "string",
#                           "enum": ["student grades", "student specific", "interest specific"],
#                           "description": "The type of question being asked"},
#         "interest": {"type": "string", "description": "The interest of the student"},
#     },
#     "required": ["question", "question type"]
# }
#
#
# def get_tagging_chain(question) -> str:
#     global schema
#     chain = create_tagging_chain(schema, llm)
#     first_answer = chain.run(question)
#     print("first answer:")
#     print(first_answer)
#     return first_answer
#
#
# def get_grading_agent():
#
#     tools = [
#         Tool(
#             name="question type",
#             func=get_tagging_chain,
#             description="Useful when you need to understand the type of the input."
#         ),
#         StructuredTool(
#             name="Grade",
#             func=generate_answer,
#             description="Useful when you need to answer questions about students, grades, interests, etc from the context of canvas discussion posts. If the question is student specific, student name is required.",
#             args_schema=ToolArgsSchema
#         )
#     ]
#     # agent = initialize_agent(tools, llm, agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, verbose=True)
#
#     agent = FakeAgent(output_parser=CustomOutputParser())
#     # prompt = CustomPromptTemplate(template=agent_prompt, tools=tools, input_variables=["input", "intermediate_steps"])
#     # output_parser = CustomOutputParser()
#     # tool_names = [tool.name for tool in tools]
#     # llm_chain = LLMChain(llm=llm, prompt=prompt)
#     # agent = LLMSingleActionAgent(
#     #     llm_chain=llm_chain,
#     #     output_parser=output_parser,
#     #     stop=["\nObservation:"],
#     #     allowed_tools=tool_names,
#     # )
#     agent_executor = AgentExecutor.from_agent_and_tools(
#         agent=agent, tools=tools, verbose=True
#     )
#
#     # return initialize_agent(tools, llm, agent=AgentType.OPENAI_FUNCTIONS, verbose=True)
#     return agent_executor
#
#
# def grade_answer(question) -> str:
#     global chat_history, vectorstore_index
#     agent = get_grading_agent()
#     return agent.run(question)
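

if __name__ == "__main__":
    # Hedged smoke-test sketch: assumes OPENAI_API_KEY is set and that Canvas
    # discussion HTML exports exist under docs/; the question is made up.
    get_search_index()
    print(get_question_type("What is Alice's total score on the 8 nouns discussion?"))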