import os
import pickle
import langchain
import faiss
from langchain import HuggingFaceHub, PromptTemplate
from langchain.chains import ConversationalRetrievalChain, LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.llms import OpenAI
from langchain.document_loaders import DirectoryLoader, TextLoader, UnstructuredHTMLLoader
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceHubEmbeddings
from langchain.memory import ConversationBufferWindowMemory
from langchain.prompts.chat import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    SystemMessagePromptTemplate,
    StringPromptTemplate,
)
from langchain.output_parsers import PydanticOutputParser
from langchain.tools.json.tool import JsonSpec
from typing import Any, Callable, List, Optional, Tuple, Union
from langchain.schema import AgentAction, AgentFinish
import re
from langchain.text_splitter import CharacterTextSplitter
from custom_faiss import MyFAISS
from langchain.cache import InMemoryCache
from langchain.chat_models import ChatGooglePalm
from langchain.document_loaders import JSONLoader
from langchain.agents import (initialize_agent, Tool, AgentType, AgentExecutor,
                              LLMSingleActionAgent, AgentOutputParser, BaseMultiActionAgent)
from langchain.tools import StructuredTool
from langchain.chains import create_tagging_chain
from pydantic import BaseModel, Field
class ToolArgsSchema(BaseModel):
    student_name: Optional[str] = Field(description="The name of the student")
    question: str = Field(description="The question being asked")
    question_type: str = Field(description="The type of question being asked")
    interest: Optional[str] = Field(description="The interest of the student")

    class Config:
        schema_extra = {
            "required": ["question", "question_type"]
        }
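

def _example_tool_args() -> ToolArgsSchema:
    # Hedged usage sketch (not called by the pipeline): the fields the grading
    # tool expects. "Alice" and the question text are invented placeholders.
    return ToolArgsSchema(
        student_name="Alice",
        question="What is Alice's total score?",
        question_type="grade-based",
    )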
langchain.llm_cache = InMemoryCache()
model_name = "GPT-4"
pickle_file = "_vs.pkl"
index_file = "_vs.index"
models_folder = "models/"
os.environ["LANGCHAIN_TRACING"] = "true"
discussions_file_path = "discussion_entries.json"
llm = OpenAI(model_name="gpt-3.5-turbo-16k", temperature=0, verbose=True)
embeddings = OpenAIEmbeddings(model='text-embedding-ada-002')
chat_history = []
memory = ConversationBufferWindowMemory(memory_key="chat_history", k=10)  # note: not currently wired into the chain below
vectorstore_index = None
agent_prompt = """
I am the LLM AI canvas discussion grading assistant.
I can answer two types of questions: grade-based questions and interest-based questions.
Grade-based questions are about the grades of a certain student or a group of students based on the rubric below for the Canvas discussion on the topic "8 nouns". ALWAYS return the total score when the question is grade-based.
Interest-based questions are about the interests or skills of a certain student or a group of students based on their discussion posts.
You have access to the following tools:
{tools}
Use the following format:
Question: the input question you must answer
Thought: you should always think about what type of question it is
Action: the action to take, should be one of [{tool_names}]
Action Input: the input to the action
Observation: the result of the action
... (this Thought/Action/Action Input/Observation can repeat N times)
Thought: I now know the final answer
Final Answer: the final answer to the original input question
Begin!
Question: {input}
{agent_scratchpad}
"""
# Set up a prompt template
class CustomPromptTemplate(StringPromptTemplate):
    # The template to use
    template: str
    ############## NEW ######################
    # The list of tools available
    tools_getter: Callable

    def format(self, **kwargs) -> str:
        # Get the intermediate steps (AgentAction, Observation tuples)
        # Format them in a particular way
        intermediate_steps = kwargs.pop("intermediate_steps")
        thoughts = ""
        for action, observation in intermediate_steps:
            thoughts += action.log
            thoughts += f"\nObservation: {observation}\nThought: "
        # Set the agent_scratchpad variable to that value
        kwargs["agent_scratchpad"] = thoughts
        ############## NEW ######################
        tools = self.tools_getter(kwargs["input"])
        # Create a tools variable from the list of tools provided
        kwargs["tools"] = "\n".join(
            [f"{tool.name}: {tool.description}" for tool in tools]
        )
        # Create a list of tool names for the tools provided
        kwargs["tool_names"] = ", ".join([tool.name for tool in tools])
        return self.template.format(**kwargs)
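

def _example_prompt_template() -> CustomPromptTemplate:
    # Hedged sketch: wires agent_prompt to a trivial tools_getter; a real
    # getter would pick tools based on the incoming question.
    return CustomPromptTemplate(
        template=agent_prompt,
        tools_getter=lambda query: [],
        input_variables=["input", "intermediate_steps"],
    )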
class CustomOutputParser(AgentOutputParser):
    def parse(self, llm_output: str) -> Union[AgentAction, AgentFinish]:
        print("llm_output")
        print(llm_output)
        # Check if agent should finish
        if "Final Answer:" in llm_output:
            return AgentFinish(
                # Return values is generally always a dictionary with a single `output` key
                # It is not recommended to try anything else at the moment :)
                return_values={"output": llm_output.split("Final Answer:")[-1].strip()},
                log=llm_output,
            )
        # Parse out the action and action input
        regex = r"Action\s*\d*\s*:(.*?)\nAction\s*\d*\s*Input\s*\d*\s*:[\s]*(.*)"
        match = re.search(regex, llm_output, re.DOTALL)
        if not match:
            raise ValueError(f"Could not parse LLM output: `{llm_output}`")
        action = match.group(1).strip()
        action_input = match.group(2)
        # Return the action and action input
        return AgentAction(tool=action, tool_input=action_input.strip(" ").strip('"'), log=llm_output)
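

def _example_parse() -> Union[AgentAction, AgentFinish]:
    # Hedged sketch: runs a fabricated ReAct-style completion through the
    # parser above. The "Grade" tool name mirrors the StructuredTool in the
    # commented-out agent code at the bottom of this file.
    sample = (
        "Thought: this is a grade-based question\n"
        "Action: Grade\n"
        "Action Input: What is Alice's total score?"
    )
    return CustomOutputParser().parse(sample)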
system_template = """
I am the LLM AI canvas discussion grading assistant.
I can answer two types of questions: grade-based questions and interest-based questions.
Grade-based questions are about the grades of a certain student or a group of students based on the rubric below for the Canvas discussion on the topic "8 nouns".
Interest-based questions are about the interests or skills of a certain student or a group of students based on their discussion posts.
To grade student discussions, I will follow the rubric below.
Student Post
3 points: Post includes 8 nouns and text describing how these nouns relate to the student.
2 points: Student's post includes 8 nouns but does not offer how those nouns relate to the student.
1 point: Student's post has significant missing details.
0 points: The student does not provide an initial post, or otherwise does not follow assignment instructions.
Response to Others
3 points: Student responds to at least 3 other student discussion threads AND responds to questions asked of them. Student posts insightful comments that prompt on-target discussion. These posts also avoid throw-away comments such as "I agree", "Me too", "Good idea".
2 points: Student was notably lacking in one criterion.
1 point: Student was notably lacking in two criteria.
0 points: The student does not interact in the threads of other students.
I will be able to identify each student by name, and I will be able to share their likings, interests, and other characteristics. I will also be able to filter out students based on their interests.
I will not deviate from the grading scheme. I will grade each discussion entry and reply carefully, and I will share the grades of all individuals by name on the basis of the rubric. I will ALWAYS return the total score when the question is grade-based.
The discussions and their replies are in the following format:
Student Post: Student Name
Reply to: Another Student Discussion ID
Your answer to grade-based questions should be in the following format:
Student Post: X points
Response to Others: X points
Total: X points
The following are the relevant discussions for grading or for answering interest-based questions
----------------
Discussions:
{context}"""
messages = [
    SystemMessagePromptTemplate.from_template(system_template),
    HumanMessagePromptTemplate.from_template("{question}"),
]
CHAT_PROMPT = ChatPromptTemplate.from_messages(messages)
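

def _example_chat_prompt():
    # Hedged example: CHAT_PROMPT expects exactly two variables, `context`
    # and `question`; the values here are placeholders, not real data.
    return CHAT_PROMPT.format_messages(
        context="<retrieved discussion chunks>",
        question="What is Alice's total score?",
    )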
def set_model_and_embeddings():
    global chat_history
    # set_model(model)
    # set_embeddings(model)
    chat_history = []
def set_embeddings(model):
    global embeddings
    if model == "GPT-3.5" or model == "GPT-4":
        print("Loading OpenAI embeddings")
        embeddings = OpenAIEmbeddings(model='text-embedding-ada-002')
    elif model == "Flan UL2" or model == "Flan T5":
        print("Loading Hugging Face embeddings")
        embeddings = HuggingFaceHubEmbeddings(repo_id="sentence-transformers/all-MiniLM-L6-v2")
def get_search_index():
    global vectorstore_index, model_name
    if os.path.isfile(get_file_path(model_name, pickle_file)) and os.path.isfile(
            get_file_path(model_name, index_file)) and os.path.getsize(get_file_path(model_name, pickle_file)) > 0:
        # Load index from pickle file
        with open(get_file_path(model_name, pickle_file), "rb") as f:
            # search_index = Chroma(persist_directory=models_folder, embedding_function=embeddings)
            search_index = pickle.load(f)
            print("Loaded index")
    else:
        search_index = create_index(model_name)
        print("Created index")
    vectorstore_index = search_index
    return search_index
def create_index(model):
    source_chunks = create_chunk_documents()
    search_index = search_index_from_docs(source_chunks)
    # search_index.persist()
    faiss.write_index(search_index.index, get_file_path(model, index_file))
    # Save index to pickle file
    with open(get_file_path(model, pickle_file), "wb") as f:
        pickle.dump(search_index, f)
    return search_index
def get_file_path(model, file):
    # If model is GPT-3.5 or GPT-4, use the OpenAI prefix; otherwise use the Hugging Face prefix
    if model == "GPT-3.5" or model == "GPT-4":
        return models_folder + "openai" + file
    else:
        return models_folder + "hf" + file
def search_index_from_docs(source_chunks):
    # print("source chunks: " + str(len(source_chunks)))
    # print("embeddings: " + str(embeddings))
    search_index = MyFAISS.from_documents(source_chunks, embeddings)
    return search_index
def get_html_files():
    loader = DirectoryLoader('docs', glob="**/*.html", loader_cls=UnstructuredHTMLLoader, recursive=True)
    document_list = loader.load()
    for document in document_list:
        # Use the file name (sans extension) as the student name in metadata
        document.metadata["name"] = document.metadata["source"].split("/")[-1].split(".")[0]
    return document_list
def metadata_func(record: dict, metadata: dict) -> dict:
    metadata["name"] = record.get("name")
    return metadata
def get_json_file():
    global discussions_file_path
    loader = JSONLoader(
        file_path=discussions_file_path,
        jq_schema='.[]', metadata_func=metadata_func, content_key="message")
    return loader.load()
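# Note (inferred from metadata_func and content_key above): this loader assumes
# discussion_entries.json is a JSON array of objects shaped roughly like
# {"name": "<student>", "message": "<post text>", ...}.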
def fetch_data_for_embeddings():
    # document_list = get_text_files()
    document_list = get_html_files()
    # document_list = get_json_file()
    print("document list: " + str(len(document_list)))
    return document_list
def get_text_files():
    loader = DirectoryLoader('docs', glob="**/*.txt", loader_cls=TextLoader, recursive=True)
    document_list = loader.load()
    return document_list
def create_chunk_documents():
    sources = fetch_data_for_embeddings()
    splitter = CharacterTextSplitter(separator=" ", chunk_size=800, chunk_overlap=0)
    source_chunks = splitter.split_documents(sources)
    print("chunks: " + str(len(source_chunks)))
    # Return the split chunks so that 800-character chunks, not whole documents, get embedded
    return source_chunks
def get_qa_chain(vectorstore_index, question, metadata):
    global llm, model_name
    print(llm)
    filter_dict = {"name": metadata.student_name}
    # embeddings_filter = EmbeddingsFilter(embeddings=embeddings, similarity_threshold=0.76)
    # compression_retriever = ContextualCompressionRetriever(base_compressor=embeddings_filter, base_retriever=gpt_3_5_index.as_retriever())
    retriever = get_retriever(filter_dict, vectorstore_index, metadata)
    print(retriever.get_relevant_documents(question))
    chain = ConversationalRetrievalChain.from_llm(llm, retriever, return_source_documents=True,
                                                  verbose=True, get_chat_history=get_chat_history,
                                                  combine_docs_chain_kwargs={"prompt": CHAT_PROMPT})
    return chain
def get_retriever(filter_dict, vectorstore_index, metadata):
    # Grade-based questions are filtered to the named student; interest-based
    # questions search across all students.
    if metadata.question_type == "grade-based":
        retriever = vectorstore_index.as_retriever(search_type='mmr',
                                                   search_kwargs={'lambda_mult': 1, 'fetch_k': 20, 'k': 10,
                                                                  'filter': filter_dict})
    else:
        retriever = vectorstore_index.as_retriever(search_type='mmr',
                                                   search_kwargs={'lambda_mult': 1, 'fetch_k': 20, 'k': 10})
    return retriever
def get_chat_history(inputs) -> str:
    res = []
    for human, ai in inputs:
        res.append(f"Human:{human}\nAI:{ai}")
    return "\n".join(res)
def generate_answer(question, metadata: ToolArgsSchema) -> str:
    # print("filter: " + filter)
    global chat_history, vectorstore_index
    chain = get_qa_chain(vectorstore_index, question, metadata)
    result = chain(
        {"question": question, "chat_history": chat_history})
    chat_history.extend([(question, result["answer"])])
    sources = []
    print(result)
    for document in result['source_documents']:
        source = document.metadata['source']
        sources.append(source.split('/')[-1].split('.')[0])
    print(sources)
    source = ',\n'.join(set(sources))
    # return result['answer'] + '\nSOURCES: ' + source
    return result['answer']
def get_question_type(question):
    parser = PydanticOutputParser(pydantic_object=ToolArgsSchema)
    prompt_template = """I can answer two types of questions: grade-based questions and interest-based questions.
Grade-based questions are about the grades of a certain student or a group of students based on the rubric below for the Canvas discussion on the topic "8 nouns".
Interest-based questions are about the interests or skills of a certain student or a group of students based on their discussion posts.
Question: {question}
Find the following information about the question asked. Leave an Optional field empty if the information is not available:
Format instructions: {format_instructions}"""
    llm = OpenAI(temperature=0)
    prompt = PromptTemplate(template=prompt_template, input_variables=["question"], output_parser=parser,
                            partial_variables={"format_instructions": parser.get_format_instructions()})
    llm_chain = LLMChain(
        llm=llm,
        prompt=prompt,
    )
    output = llm_chain.run(question)
    output = parser.parse(output)
    output = generate_answer(question, output)
    return output
# class FakeAgent(BaseMultiActionAgent):
#     """Fake Custom Agent."""
#
#     @property
#     def input_keys(self):
#         return ["input"]
#
#     def plan(
#         self, intermediate_steps: List[Tuple[AgentAction, str]], **kwargs: Any
#     ) -> Union[List[AgentAction], AgentFinish]:
#         print("input keys")
#         print(self.input_keys)
#         print("intermediate steps")
#         print(intermediate_steps)
#         print("kwargs")
#         print(kwargs)
#
#         """Given input, decide what to do.
#
#         Args:
#             intermediate_steps: Steps the LLM has taken to date,
#                 along with observations
#             **kwargs: User inputs.
#
#         Returns:
#             Action specifying what tool to use.
#         """
#         if len(intermediate_steps) == 0:
#             first_action = AgentAction(tool="question type", tool_input=kwargs["input"], log="")
#             print("first action")
#             print(first_action)
#             second_action = AgentAction(tool="Grade", tool_input=kwargs["input"], log="")
#             print("second action")
#             print(second_action)
#             return [
#                 first_action,
#                 second_action,
#             ]
#         else:
#             return AgentFinish(return_values={"output": "bar"}, log="")
#
#     async def aplan(
#         self, intermediate_steps: List[Tuple[AgentAction, str]], **kwargs: Any
#     ) -> Union[List[AgentAction], AgentFinish]:
#         """Given input, decide what to do.
#
#         Args:
#             intermediate_steps: Steps the LLM has taken to date,
#                 along with observations
#             **kwargs: User inputs.
#
#         Returns:
#             Action specifying what tool to use.
#         """
#         if len(intermediate_steps) == 0:
#             return [
#                 AgentAction(tool="question type", tool_input=kwargs["input"], log=""),
#                 AgentAction(tool="Grade",
#                             tool_input={
#                                 "student_name": kwargs["student_name"],
#                                 "question": kwargs["question"],
#                                 "question_type": kwargs["question_type"],
#                                 "interest": kwargs["interest"]
#                             }, log=""),
#             ]
#         else:
#             return AgentFinish(return_values={"output": "bar"}, log="")
#
#
# schema = {
#     "properties": {
#         "student_name": {"type": "string", "description": "The name of the student"},
#         "question": {"type": "string", "description": "The question being asked"},
#         "question type": {"type": "string",
#                           "enum": ["student grades", "student specific", "interest specific"],
#                           "description": "The type of question being asked"},
#         "interest": {"type": "string", "description": "The interest of the student"},
#     },
#     "required": ["question", "question type"]
# }
#
#
# def get_tagging_chain(question) -> str:
#     global schema
#     chain = create_tagging_chain(schema, llm)
#     first_answer = chain.run(question)
#     print("first answer:")
#     print(first_answer)
#     return first_answer
#
#
# def get_grading_agent():
#
#     tools = [
#         Tool(
#             name="question type",
#             func=get_tagging_chain,
#             description="Useful when you need to understand the type of the input."
#         ),
#         StructuredTool(
#             name="Grade",
#             func=generate_answer,
#             description="Useful when you need to answer questions about students, grades, interests, etc from the context of canvas discussion posts. If the question is student specific, student name is required.",
#             args_schema=ToolArgsSchema
#         )
#     ]
#     # agent = initialize_agent(tools, llm, agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, verbose=True)
#
#     agent = FakeAgent(output_parser=CustomOutputParser())
#     # prompt = CustomPromptTemplate(template=agent_prompt, tools=tools, input_variables=["input", "intermediate_steps"])
#     # output_parser = CustomOutputParser()
#     # tool_names = [tool.name for tool in tools]
#     # llm_chain = LLMChain(llm=llm, prompt=prompt)
#     # agent = LLMSingleActionAgent(
#     #     llm_chain=llm_chain,
#     #     output_parser=output_parser,
#     #     stop=["\nObservation:"],
#     #     allowed_tools=tool_names,
#     # )
#     agent_executor = AgentExecutor.from_agent_and_tools(
#         agent=agent, tools=tools, verbose=True
#     )
#
#     # return initialize_agent(tools, llm, agent=AgentType.OPENAI_FUNCTIONS, verbose=True)
#     return agent_executor
#
#
# def grade_answer(question) -> str:
#     global chat_history, vectorstore_index
#     agent = get_grading_agent()
#     return agent.run(question)
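

if __name__ == "__main__":
    # Hedged smoke-test sketch: assumes OPENAI_API_KEY is set and that Canvas
    # discussion HTML exports exist under docs/; the question is made up.
    get_search_index()
    print(get_question_type("What is Alice's total score on the 8 nouns discussion?"))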