Spaces:
Runtime error
Runtime error
File size: 3,551 Bytes
5491a72 ca90067 5491a72 42bddce 5491a72 42bddce c22681b 5491a72 c22681b 5491a72 c22681b 0e1fedf c22681b 5491a72 c22681b 5491a72 2009b2e 5491a72 ca90067 42bddce cfd0845 5491a72 ca90067 5491a72 ca90067 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 |
import gradio as gr
# retrievers
from langchain.chains import RetrievalQA
import runpod
import textwrap
import time
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
# models
from langchain.llms import HuggingFacePipeline
from InstructorEmbedding import INSTRUCTOR
from langchain.embeddings import HuggingFaceInstructEmbeddings
# prompts
from langchain import PromptTemplate, LLMChain
# vector stores
from langchain.vectorstores import FAISS
import os
# Directory passed as cache_dir to both from_pretrained calls below.
cache_path = "/runpod-volume"
# Hub repository id of the causal language model and its tokenizer.
model_repo = 'daryl149/llama-2-13b-chat-hf'

tokenizer = AutoTokenizer.from_pretrained(
    model_repo,
    cache_dir=cache_path,
    use_fast=True,
)

# Load the model with 4-bit loading requested, float16 as the torch dtype,
# and automatic device placement.
model = AutoModelForCausalLM.from_pretrained(
    model_repo,
    cache_dir=cache_path,
    device_map='auto',
    torch_dtype=torch.float16,
    load_in_4bit=True,
    low_cpu_mem_usage=True,
    trust_remote_code=True,
)
# Generation settings forwarded to the text-generation pipeline below.
# NOTE(review): confirm the loaded model actually supports sequences of
# max_len tokens.
max_len = 8192
# These must be plain numbers. A trailing comma after the value (as in
# `temperature = 0,`) silently creates a one-element tuple, which is not a
# valid sampling parameter.
# NOTE(review): do_sample is not passed to the pipeline, so whether
# temperature/top_p have any effect depends on the generation defaults.
temperature = 0
top_p = 0.95
repetition_penalty = 1.15

# Text-generation pipeline around the already-loaded model and tokenizer.
# The EOS token id is reused as the padding token id.
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    pad_token_id=tokenizer.eos_token_id,
    max_length=max_len,
    temperature=temperature,
    top_p=top_p,
    repetition_penalty=repetition_penalty,
)

# Wrap the pipeline so it can be used as the LLM of the LangChain chain.
llm = HuggingFacePipeline(pipeline=pipe)
# Number of similar passages.
k = 3

# Folder name handed to FAISS.load_local below.
embeddings_shl_path = "faiss_index_shl"
# Model id handed to the embeddings wrapper as model_name.
embeddings_model_repo = 'sentence-transformers/all-MiniLM-L6-v2'

# Embedding model, placed on the CUDA device via model_kwargs.
# NOTE(review): the model id is a sentence-transformers model while the
# wrapper is the Instruct embeddings class - presumably this matches how the
# saved index was built; confirm before changing either side.
embeddings = HuggingFaceInstructEmbeddings(
    model_kwargs={"device": "cuda"},
    model_name=embeddings_model_repo,
)

# Load the saved FAISS vector store using the embedding model above.
vectordb = FAISS.load_local(embeddings_shl_path, embeddings)
# Prompt text for the QA chain. It has two placeholders: {context} and
# {question}, both declared as input variables on PROMPT below.
prompt_template = """
Don't try to make up an answer, if you don't know just say that you don't know.
Answer in the same language the question was asked.
Don't mention in the answer the speaker just give the answer directly.
Use only the following pieces of context to answer the question at the end.
{context}
Question: {question}
Answer:"""

# Template object passed to the chain through chain_type_kwargs.
PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)
# Retriever over the FAISS store. search_type is an argument of as_retriever
# itself; search_kwargs only carries the options forwarded to the search call
# (here the number of passages, taken from the module-level k instead of a
# duplicated literal).
retriever = vectordb.as_retriever(
    search_type="similarity",
    search_kwargs={"k": k},
)

# Retrieval QA chain: retrieved passages are inserted into PROMPT
# ("stuff" chain type) and the source documents are returned alongside the
# answer so they can be listed in the response.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",  # map_reduce, map_rerank, stuff, refine
    retriever=retriever,
    chain_type_kwargs={"prompt": PROMPT},
    return_source_documents=True,
    verbose=False,
)
def wrap_text_preserve_newlines(text, width=700):
# Split the input text into lines based on newline characters
lines = text.split('\n')
# Wrap each line individually
wrapped_lines = [textwrap.fill(line, width=width) for line in lines]
# Join the wrapped lines back together using newline characters
wrapped_text = '\n'.join(wrapped_lines)
return wrapped_text
def process_llm_response(llm_response):
    """Turn a QA chain response into answer text followed by its sources.

    Args:
        llm_response: Mapping with a 'result' string and a
            'source_documents' iterable. Each document must expose a
            ``metadata`` mapping with a 'source' path string.

    Returns:
        A tuple ``(ans, sources_used)``. ``sources_used`` is the list of
        source file names (directory and extension removed), each wrapped
        in ``<b>`` tags and joined with ' \\n'. ``ans`` is the wrapped
        answer followed by a heading line and ``sources_used``.
    """
    ans = wrap_text_preserve_newlines(llm_response['result'])

    source_labels = []
    for source in llm_response['source_documents']:
        file_name = source.metadata['source'].split('/')[-1]
        # Remove the real extension rather than a fixed four characters, so
        # names such as "episode.json" or "episode" are not mangled.
        stem = os.path.splitext(file_name)[0]
        source_labels.append("<b> - " + stem + "</b>")
    sources_used = ' \n'.join(source_labels)

    ans += "\n Sand Hill Road podcast episodes based on your question : \n" + sources_used
    return ans, sources_used
def text_generation(job):
    """Serverless job handler: answer the prompt carried by ``job``.

    Args:
        job: Job payload; the question is read from
            ``job["input"]["prompt"]``.

    Returns:
        The formatted answer with every newline replaced by ``<br/>``, or
        a dict with a single "error" key when the payload has no usable
        prompt.
    """
    prompt = (job.get("input") or {}).get("prompt")
    # Reject malformed requests up front with a readable message instead of
    # failing with a KeyError (or inside the chain) on a missing/empty prompt.
    if not isinstance(prompt, str) or not prompt.strip():
        return {"error": "Request must provide a non-empty string in input.prompt."}

    llm_response = qa_chain(prompt)
    # Only the combined answer text is returned; the separate source list
    # is already appended to it by process_llm_response.
    answer, _sources = process_llm_response(llm_response)
    return answer.replace("\n", "<br/>")
# Hand text_generation to the RunPod serverless runtime as the job handler.
runpod.serverless.start({"handler": text_generation})