Spaces:

MedTiouti
/

SandHillRoadPodcast

Runtime error

Med Tiouti

Fix runpod Mounted Volume Path

42bddce 10 months ago

3.55 kB

	import gradio as gr
	# retrievers
	from langchain.chains import RetrievalQA
	import runpod

	import textwrap
	import time

	import torch
	import transformers
	from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

	# models
	from langchain.llms import HuggingFacePipeline
	from InstructorEmbedding import INSTRUCTOR
	from langchain.embeddings import HuggingFaceInstructEmbeddings

	# prompts
	from langchain import PromptTemplate, LLMChain

	# vector stores
	from langchain.vectorstores import FAISS
	import os

	# Directory passed as ``cache_dir`` to both ``from_pretrained`` calls below
	# (presumably the RunPod volume mount point -- confirm against the deployment).
	cache_path = "/runpod-volume"
	model_repo = 'daryl149/llama-2-13b-chat-hf'

	# Tokenizer for the chat model, loaded from the same repo and cache directory.
	tokenizer = AutoTokenizer.from_pretrained(
	    model_repo,
	    cache_dir=cache_path,
	    use_fast=True,
	)

	# Causal language model used for answer generation.
	model = AutoModelForCausalLM.from_pretrained(
	    model_repo,
	    cache_dir=cache_path,
	    device_map='auto',
	    torch_dtype=torch.float16,
	    load_in_4bit=True,
	    low_cpu_mem_usage=True,
	    # NOTE(review): trust_remote_code presumably lets code shipped in the
	    # model repo run at load time -- confirm it is actually required here.
	    trust_remote_code=True,
	)
	# Value handed to the pipeline as ``max_length``.
	# NOTE(review): 8192 may exceed the context window of the loaded model --
	# confirm against the model configuration.
	max_len = 8192

	# Generation settings. These must be plain numbers: a trailing comma after
	# the value (``temperature = 0,``) silently creates a one-element tuple,
	# which is not a valid value for these parameters.
	# NOTE(review): ``do_sample`` is not set, so temperature/top_p may have no
	# effect if the pipeline decodes greedily -- confirm the intended behaviour.
	temperature = 0
	top_p = 0.95
	repetition_penalty = 1.15

	# Text-generation pipeline built on the model and tokenizer loaded above.
	pipe = pipeline(
	    task="text-generation",
	    model=model,
	    tokenizer=tokenizer,
	    pad_token_id=tokenizer.eos_token_id,
	    max_length=max_len,
	    temperature=temperature,
	    top_p=top_p,
	    repetition_penalty=repetition_penalty,
	)

	# LangChain wrapper around the pipeline; this is the LLM the QA chain uses.
	llm = HuggingFacePipeline(pipeline=pipe)

	# Number of similar passages to retrieve per question.
	k = 3

	# Folder holding the saved FAISS index, and the embedding model repo used
	# to embed queries against it.
	embeddings_shl_path = "faiss_index_shl"
	embeddings_model_repo = 'sentence-transformers/all-MiniLM-L6-v2'

	# Embedding model, placed on the GPU.
	embeddings = HuggingFaceInstructEmbeddings(
	    model_kwargs={"device": "cuda"},
	    model_name=embeddings_model_repo,
	)

	# Vector store loaded from the local FAISS index folder.
	vectordb = FAISS.load_local(embeddings_shl_path, embeddings)


	# Prompt text sent to the LLM. ``{context}`` and ``{question}`` are the two
	# placeholders declared as input variables on PROMPT below. Everything
	# between the triple quotes, including leading whitespace and blank lines,
	# is part of the prompt, so edit it with care.
	prompt_template = """
	Don't try to make up an answer, if you don't know just say that you don't know.
	Answer in the same language the question was asked.
	Don't mention in the answer the speaker just give the answer directly.
	Use only the following pieces of context to answer the question at the end.

	{context}

	Question: {question}
	Answer:"""


	# Prompt object passed to the RetrievalQA chain through ``chain_type_kwargs``.
	PROMPT = PromptTemplate(
	template = prompt_template,
	input_variables = ["context", "question"]
	)

	# Retriever over the FAISS index. ``search_type`` is an argument of
	# ``as_retriever`` itself; ``search_kwargs`` only carries options for the
	# search call, so it holds just the number of passages (module-level ``k``).
	retriever = vectordb.as_retriever(
	    search_type="similarity",
	    search_kwargs={"k": k},
	)

	# Question-answering chain: the retrieved passages are inserted into PROMPT
	# ("stuff" chain type) and the source documents are returned with the answer.
	qa_chain = RetrievalQA.from_chain_type(
	    llm=llm,
	    chain_type="stuff",  # alternatives: map_reduce, map_rerank, refine
	    retriever=retriever,
	    chain_type_kwargs={"prompt": PROMPT},
	    return_source_documents=True,
	    verbose=False,
	)

	def wrap_text_preserve_newlines(text, width=700):
	    """Wrap *text* to *width* columns while keeping its existing line breaks.

	    The text is split on newline characters, each resulting line is wrapped
	    on its own with ``textwrap.fill``, and the pieces are joined back
	    together with newlines.

	    Args:
	        text: The string to wrap.
	        width: Maximum line width passed to ``textwrap.fill``.

	    Returns:
	        The wrapped text as a single string.
	    """
	    wrapped_segments = (
	        textwrap.fill(segment, width=width) for segment in text.split('\n')
	    )
	    return '\n'.join(wrapped_segments)

	def process_llm_response(llm_response):
	    """Format a QA-chain response as answer text followed by its sources.

	    Args:
	        llm_response: Mapping with a ``'result'`` string and a
	            ``'source_documents'`` iterable; each document must expose a
	            ``metadata['source']`` path string.

	    Returns:
	        A ``(ans, sources_used)`` tuple. ``sources_used`` is one bold
	        ``<b> - name</b>`` entry per source document, joined with ``' \\n'``;
	        ``ans`` is the wrapped answer followed by a heading line and
	        ``sources_used``.

	    Raises:
	        KeyError: If ``'result'``, ``'source_documents'`` or a document's
	            ``'source'`` metadata entry is missing.
	    """
	    ans = wrap_text_preserve_newlines(llm_response['result'])

	    source_entries = []
	    for source in llm_response['source_documents']:
	        file_name = source.metadata['source'].split('/')[-1]
	        # Strip the actual extension rather than a fixed four characters, so
	        # names without a three-letter extension are not truncated.
	        episode_name = os.path.splitext(file_name)[0]
	        source_entries.append("<b> - " + episode_name + "</b>")
	    sources_used = ' \n'.join(source_entries)

	    ans += "\n Sand Hill Road podcast episodes based on your question : \n" + sources_used
	    return ans, sources_used

	def text_generation(job):
	    """Answer the prompt carried by a job and return it as HTML-style text.

	    Args:
	        job: Mapping whose ``job["input"]["prompt"]`` entry holds the question.

	    Returns:
	        The formatted answer with every newline replaced by ``<br/>``.
	    """
	    question = job["input"]["prompt"]
	    formatted_answer, _sources = process_llm_response(qa_chain(question))
	    return str(formatted_answer).replace("\n", "<br/>")



	# Start the RunPod serverless worker with text_generation as the job handler.
	runpod.serverless.start(dict(handler=text_generation))