"""RunPod serverless worker that answers questions with a RetrievalQA chain.

On import the module loads a 4-bit quantised Llama-2 chat model, wraps it in a
Hugging Face text-generation pipeline, loads a local FAISS index and wires
everything into a LangChain ``RetrievalQA`` chain. ``text_generation`` is then
registered as the RunPod serverless handler.
"""

import textwrap
import time

import gradio as gr
import runpod
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# retrievers
from langchain.chains import RetrievalQA

# models
from langchain.llms import HuggingFacePipeline
from InstructorEmbedding import INSTRUCTOR
from langchain.embeddings import HuggingFaceInstructEmbeddings

# prompts
from langchain import PromptTemplate, LLMChain

# vector stores
from langchain.vectorstores import FAISS


def get_model():
    """Load the chat model and its tokenizer.

    Returns:
        A ``(tokenizer, model, max_len)`` tuple, where ``max_len`` is the
        ``max_length`` value later handed to the generation pipeline.
    """
    model_repo = 'daryl149/llama-2-13b-chat-hf'

    tokenizer = AutoTokenizer.from_pretrained(model_repo, use_fast=True)

    model = AutoModelForCausalLM.from_pretrained(
        model_repo,
        device_map='auto',
        load_in_4bit=True,
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
        trust_remote_code=True,
    )

    # NOTE(review): 8192 looks larger than this model's context window --
    # confirm against the model card before relying on long generations.
    max_len = 8192

    return tokenizer, model, max_len


tokenizer, model, max_len = get_model()

# Generation settings. These must be plain scalars: a trailing comma after a
# value would turn it into a 1-tuple and break the pipeline call below.
temperature = 0
top_p = 0.95
repetition_penalty = 1.15

pipe = pipeline(
    task="text-generation",
    # Reuse the model loaded by get_model(); passing the repo name here would
    # make the pipeline load a second, unquantised copy of the weights.
    model=model,
    tokenizer=tokenizer,
    pad_token_id=tokenizer.eos_token_id,
    max_length=max_len,
    temperature=temperature,
    top_p=top_p,
    repetition_penalty=repetition_penalty,
)

llm = HuggingFacePipeline(pipeline=pipe)

# similar passages
k = 3

embeddings_shl_path = "faiss_index_shl"
embeddings_model_repo = 'sentence-transformers/all-MiniLM-L6-v2'

### download embeddings model
# NOTE(review): an Instructor embeddings wrapper is paired with a
# sentence-transformers MiniLM checkpoint; verify this matches how the FAISS
# index was built before changing either side.
embeddings = HuggingFaceInstructEmbeddings(
    model_name=embeddings_model_repo,
    model_kwargs={"device": "cuda"},
)

### load vector DB embeddings
vectordb = FAISS.load_local(
    embeddings_shl_path,
    embeddings,
)

# The line structure below is reconstructed from a whitespace-mangled source;
# the wording is unchanged.
prompt_template = """
Don't try to make up an answer, if you don't know just say that you don't know.
Answer in the same language the question was asked.
Don't mention in the answer the speaker just give the answer directly.
Use only the following pieces of context to answer the question at the end.

{context}

Question: {question}
Answer:"""

PROMPT = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"],
)

# ``search_type`` is an argument of ``as_retriever`` itself; only per-search
# options such as ``k`` belong in ``search_kwargs``.
retriever = vectordb.as_retriever(
    search_type="similarity",
    search_kwargs={"k": k},
)

qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",  # map_reduce, map_rerank, stuff, refine
    retriever=retriever,
    chain_type_kwargs={"prompt": PROMPT},
    return_source_documents=True,
    verbose=False,
)

# Text substituted for each "\n" in the final answer. The original source had
# a raw line break inside a one-line string literal here (not valid Python);
# it is preserved as an escaped newline, which leaves the answer unchanged.
# NOTE(review): if the consumer renders HTML this was presumably "<br>" --
# confirm with the client before changing it.
ANSWER_LINE_BREAK = "\n"


def wrap_text_preserve_newlines(text: str, width: int = 700) -> str:
    """Wrap each line of ``text`` to ``width`` columns, keeping its newlines.

    Args:
        text: Text to wrap; existing ``"\\n"`` separators are kept.
        width: Maximum line width passed to ``textwrap.fill``.

    Returns:
        The wrapped text, joined back together with ``"\\n"``.
    """
    # Wrap line by line so that the original line breaks survive; wrapping the
    # whole text at once would collapse them.
    return '\n'.join(
        textwrap.fill(line, width=width) for line in text.split('\n')
    )


def process_llm_response(llm_response):
    """Format a chain response as answer text followed by its sources.

    Args:
        llm_response: Mapping with a ``'result'`` string and a
            ``'source_documents'`` sequence whose items expose
            ``metadata['source']``.

    Returns:
        A ``(answer, sources_used)`` tuple: the wrapped answer with the source
        list appended, and the source list on its own.
    """
    ans = wrap_text_preserve_newlines(llm_response['result'])

    # Keep only the file name of each source and drop its last four
    # characters (presumably a three-letter extension such as ".txt").
    episode_lines = [
        " - " + doc.metadata['source'].split('/')[-1][:-4]
        for doc in llm_response['source_documents']
    ]
    sources_used = ' \n'.join(episode_lines)

    ans += "\n Sand Hill Road podcast episodes based on your question : \n" + sources_used
    return ans, sources_used


def text_generation(job):
    """RunPod handler: answer the prompt carried by ``job``.

    Args:
        job: RunPod job mapping. The prompt is read from
            ``job["input"]["prompt"]``; a top-level ``job["prompt"]`` is
            accepted as a fallback for callers using the older shape.

    Returns:
        The formatted answer, including the list of source episodes.

    Raises:
        KeyError: If no ``"prompt"`` entry is present.
    """
    # RunPod nests the request payload under "input".
    payload = job.get("input", job)
    prompt = payload["prompt"]

    # RetrievalQA expects the question under its "query" input key.
    llm_response = qa_chain({"query": prompt})
    ans, _ = process_llm_response(llm_response)
    return str(ans).replace("\n", ANSWER_LINE_BREAK)


runpod.serverless.start({"handler": text_generation})