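# RunPod serverless handler for retrieval-augmented question answering over
# "Sand Hill Road" podcast episodes: a 4-bit Llama-2-13B-chat model answers
# questions using passages retrieved from a local FAISS index (faiss_index_shl).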
import gradio as gr
# retrievers
from langchain.chains import RetrievalQA
import runpod
import textwrap
import time
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
# models
from langchain.llms import HuggingFacePipeline
from InstructorEmbedding import INSTRUCTOR
from langchain.embeddings import HuggingFaceInstructEmbeddings
# prompts
from langchain import PromptTemplate, LLMChain
# vector stores
from langchain.vectorstores import FAISS

def get_model():
    model_repo = 'daryl149/llama-2-13b-chat-hf'

    tokenizer = AutoTokenizer.from_pretrained(model_repo, use_fast=True)

    # load the chat model in 4-bit (requires bitsandbytes) and spread it across available GPUs
    model = AutoModelForCausalLM.from_pretrained(
        model_repo,
        device_map='auto',
        load_in_4bit=True,
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
        trust_remote_code=True
    )

    # Llama 2 has a 4096-token context window, so cap generation there
    max_len = 4096

    return tokenizer, model, max_len

tokenizer, model, max_len = get_model()

# generation settings
temperature = 0          # greedy decoding; do_sample is left at its default (False)
top_p = 0.95
repetition_penalty = 1.15

pipe = pipeline(
    task="text-generation",
    model=model,            # reuse the 4-bit model loaded above rather than reloading it by repo id
    tokenizer=tokenizer,
    pad_token_id=tokenizer.eos_token_id,
    max_length=max_len,
    temperature=temperature,
    top_p=top_p,
    repetition_penalty=repetition_penalty
)

llm = HuggingFacePipeline(pipeline=pipe)
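
# Optional local sanity check before wiring the chain together (illustrative only;
# assumes a GPU is available and the model has finished loading):
#   print(llm("Question: What is Sand Hill Road?\nAnswer:"))
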
# number of similar passages to retrieve per question
k = 3

embeddings_shl_path = "faiss_index_shl"
embeddings_model_repo = 'sentence-transformers/all-MiniLM-L6-v2'

### download embeddings model
# the embedding model must match the one used to build the faiss_index_shl index
embeddings = HuggingFaceInstructEmbeddings(
    model_name=embeddings_model_repo,
    model_kwargs={"device": "cuda"}
)

### load vector DB embeddings
vectordb = FAISS.load_local(
    embeddings_shl_path,
    embeddings
)
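# NOTE: newer langchain releases require an explicit
# FAISS.load_local(..., allow_dangerous_deserialization=True) to load pickled
# indexes; add that keyword argument if you upgrade langchain.
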
prompt_template = """
Don't try to make up an answer; if you don't know, just say that you don't know.
Answer in the same language the question was asked in.
Don't mention the speaker in the answer; just give the answer directly.
Use only the following pieces of context to answer the question at the end.
{context}
Question: {question}
Answer:"""

PROMPT = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"]
)

retriever = vectordb.as_retriever(search_type="similarity", search_kwargs={"k": k})

qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",  # alternatives: map_reduce, map_rerank, refine
    retriever=retriever,
    chain_type_kwargs={"prompt": PROMPT},
    return_source_documents=True,
    verbose=False
)
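
# Example of querying the chain directly (illustrative; any question about the
# indexed episodes works):
#   res = qa_chain({"query": "Which episodes discuss fundraising?"})
#   print(res["result"])
#   print([doc.metadata["source"] for doc in res["source_documents"]])
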
def wrap_text_preserve_newlines(text, width=700):
    # Split the input text into lines based on newline characters
    lines = text.split('\n')
    # Wrap each line individually
    wrapped_lines = [textwrap.fill(line, width=width) for line in lines]
    # Join the wrapped lines back together using newline characters
    wrapped_text = '\n'.join(wrapped_lines)
    return wrapped_text

def process_llm_response(llm_response):
    ans = wrap_text_preserve_newlines(llm_response['result'])

    # list the source episodes as bold HTML items (file name without its 4-character extension)
    sources_used = ' \n'.join(
        [
            "<b> - " + source.metadata['source'].split('/')[-1][:-4] + "</b>"
            for source in llm_response['source_documents']
        ]
    )

    ans += "\nSand Hill Road podcast episodes related to your question:\n" + sources_used
    return ans, sources_used

def text_generation(job):
    # RunPod passes the request payload under job["input"]; RetrievalQA expects the
    # question under the "query" key
    llm_response = qa_chain({"query": job["input"]["prompt"]})
    ans, sources_used = process_llm_response(llm_response)
    return str(ans).replace("\n", "<br/>")

runpod.serverless.start({"handler": text_generation})
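
# Example request to the deployed endpoint (illustrative; <ENDPOINT_ID> and the
# API key are placeholders for your own RunPod deployment):
#
#   POST https://api.runpod.ai/v2/<ENDPOINT_ID>/runsync
#   {"input": {"prompt": "What did the guests say about seed-stage startups?"}}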