# SohamNale's picture
# Create app.py
# ac003a6
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer
import transformers
# Hub identifiers: the PEFT adapter and the base model it is applied to.
adapters_name = "SohamNale/Soham_Misteral"
model_name = "anakin87/zephyr-7b-alpha-sharded"

# NOTE(review): not referenced by the loading code below (placement is left to
# device_map="auto"); kept so the module-level name stays available.
device = "cuda"  # the device to load the model onto

# 4-bit NF4 quantization with double quantization and bfloat16 compute dtype.
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Quantization is requested only through `quantization_config`. The former
# extra `load_in_4bit=True` kwarg duplicated what bnb_config already states,
# and newer transformers releases raise ValueError when both are supplied.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config,
    device_map="auto",
)

# Wrap the quantized base model with the adapter weights.
model = PeftModel.from_pretrained(model, adapters_name)

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.bos_token_id = 1

# NOTE(review): not used by the visible code; kept for compatibility.
stop_token_ids = [0]

print(f"Successfully loaded the model {model_name} into memory")
import gradio as gr
def ui(text):
    """Generate a reply for the query entered in the web UI.

    Args:
        text: Prompt string to tokenize and feed to the model.

    Returns:
        The first generated sequence decoded to a string, with special
        tokens skipped. At most 500 new tokens are generated.
        NOTE(review): for causal LMs the generated sequence normally starts
        with the prompt tokens, so the prompt is likely echoed in the
        result -- confirm this is the intended output.
    """
    # Move the inputs to wherever the model actually lives instead of
    # assuming "cuda:0": the model is loaded with device_map='auto', so its
    # placement is decided at load time, not by this function.
    inputs = tokenizer(text, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=500)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
# Text-in / text-out Gradio interface that routes each submission through `ui`.
demo = gr.Interface(
    fn=ui,
    inputs="text",
    outputs="text",
    title="Banking Query Assistant",
    description="Enter the query prompt for which you require assistance",
)

# Start serving the interface (inline display explicitly disabled).
demo.launch(inline=False)