# NOTE: header residue from the web page this file was copied from
# (apparently a Hugging Face Spaces file view; page status: "Runtime error").
# File size: 4,535 Bytes
# Commit hashes shown in the page gutter:
# cdf6cb1 498ddeb 790c7af 498ddeb ab440a4 498ddeb dbb6c49 790c7af ab440a4 790c7af 498ddeb 790c7af 22c11b2 3c0fc42 cdf6cb1 3c0fc42 878d57a
# (The page's line-number gutter, 1-142, carried no content and is omitted.)
# import dependencies
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
import os
import gradio as gr
#from google.colab import drive
import chromadb
from langchain.llms import HuggingFacePipeline
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain import HuggingFacePipeline
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
#import locale
#locale.getpreferredencoding = lambda: "UTF-8"
# Hugging Face model name of the causal language model to load.
model_name = "anakin87/zephyr-7b-alpha-sharded"
# Model page: https://huggingface.co/anakin87/zephyr-7b-alpha-sharded
# Another checkpoint noted by the author (not used here): HuggingFaceH4/zephyr-7b-alpha
# Model page: https://huggingface.co/HuggingFaceH4/zephyr-7b-alpha
# function for loading 4-bit quantized model
def load_quantized_model(model_name: str):
    """
    Load a causal language model with 4-bit bitsandbytes quantization.

    The quantization settings live in exactly one place, the
    ``BitsAndBytesConfig``. Previously the config said ``load_in_4bit=False``
    while ``from_pretrained`` was also given ``load_in_4bit=True``; the two
    contradicted each other, and transformers rejects the keyword when a
    ``quantization_config`` is supplied.

    :param model_name: Name or path of the model to be loaded.
    :return: Loaded quantized model.
    """
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=False,
        bnb_4bit_quant_type="nf4",
    )
    # Do not pass load_in_4bit/load_in_8bit here as well: the config above is
    # the single source of truth for quantization.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
    )
    return model
# function for initializing tokenizer
def initialize_tokenizer(model_name: str):
    """
    Create the tokenizer for ``model_name``.

    :param model_name: Name or path of the model whose tokenizer is loaded.
    :return: The tokenizer, with its beginning-of-sentence token id set to 1.
    """
    tok = AutoTokenizer.from_pretrained(model_name)
    # Force the beginning-of-sentence token id to 1.
    tok.bos_token_id = 1
    return tok
# Load the model once, when this module is executed.
model = load_quantized_model(model_name)
# Load the tokenizer for the same model name.
tokenizer = initialize_tokenizer(model_name)
# Stop token ids.
# NOTE(review): not referenced by any other visible line; confirm whether it is still needed.
stop_token_ids = [0]
# load pdf files
# Directory that is scanned for PDF files. It was previously never defined
# (NameError at startup). Override it with the PDF_FILES_DIR environment
# variable; the default is a "pdf_files" folder in the working directory.
pdf_files = os.environ.get("PDF_FILES_DIR", "pdf_files")
if not os.path.isdir(pdf_files):
    # Fail early with a clear message instead of building an empty index.
    raise FileNotFoundError(f"PDF directory not found: {pdf_files!r}")
loader = PyPDFDirectoryLoader(pdf_files)
documents = loader.load()
# split the documents in small chunks
# Change chunk_size and chunk_overlap as needed.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
all_splits = text_splitter.split_documents(documents)
# specify embedding model (using huggingface sentence transformer)
embedding_model_name = "sentence-transformers/all-mpnet-base-v2"
# Disabled variant that passed model_kwargs={"device": "cuda"} to the embeddings:
#model_kwargs = {"device": "cuda"}
#embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name, model_kwargs=model_kwargs)
embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)
# Embed the document chunks into a Chroma vector store (persist_directory is "chroma_db").
vectordb = Chroma.from_documents(documents=all_splits, embedding=embeddings, persist_directory="chroma_db")
# Expose the vector store as a retriever for the question-answering chain.
retriever = vectordb.as_retriever()
# build huggingface pipeline for using zephyr-7b-alpha
# The result is bound to its own name so the imported transformers `pipeline`
# factory function is not overwritten by the object it returns.
generation_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    use_cache=True,
    device_map="auto",
    max_length=2048,
    do_sample=True,
    top_k=5,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    # The end-of-sequence token id is reused as the padding token id.
    pad_token_id=tokenizer.eos_token_id,
)
# specify the llm: wrap the transformers pipeline for use by langchain
llm = HuggingFacePipeline(pipeline=generation_pipeline)
# build conversational retrieval chain (rag) using langchain
def create_conversation(query: str, chat_history: list) -> tuple:
    """
    Answer ``query`` with the retrieval chain and record the turn.

    The earlier turns in ``chat_history`` are handed to the chain so that
    follow-up questions can refer to them. No per-call memory object is
    attached: a freshly created, empty memory would supply the chain's
    ``chat_history`` input in place of the history passed here, so earlier
    turns would never reach the model.

    :param query: The user's message.
    :param chat_history: List of (user message, bot message) pairs. It is
        mutated in place: the new turn is appended.
    :return: ``('', chat_history)``. The empty string is the new value for
        the first output component wired to this callback.
    """
    try:
        # The chain's default history formatter accepts tuples of strings;
        # normalise list pairs and missing messages accordingly.
        prior_turns = [
            (str(human or ''), str(ai or '')) for human, ai in chat_history
        ]
        qa_chain = ConversationalRetrievalChain.from_llm(
            llm=llm,
            retriever=retriever,
        )
        result = qa_chain({'question': query, 'chat_history': prior_turns})
        chat_history.append((query, result['answer']))
    except Exception as e:
        # UI boundary: show the failure in the chat instead of crashing the
        # callback. The history must hold strings, not exception objects.
        chat_history.append((query, f"{type(e).__name__}: {e}"))
    return '', chat_history
# build gradio ui
with gr.Blocks() as demo:
    chatbot = gr.Chatbot(label='Chat with your data (Zephyr 7B Alpha)')
    msg = gr.Textbox()
    # Button that clears both the textbox and the chat history.
    clear = gr.ClearButton([msg, chatbot])
    # On submit, create_conversation receives (msg, chatbot) and its two
    # return values are written back to (msg, chatbot).
    msg.submit(create_conversation, [msg, chatbot], [msg, chatbot])
# Start the app (the stray "|" that followed this call was page residue).
demo.launch()