# Spaces status: Runtime error
import torch
import transformers
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer
# Identifiers passed to the `from_pretrained` loaders below: the PEFT adapter
# and the base checkpoint it is applied on top of.
adapters_name = "SohamNale/Soham_Misteral"
model_name = "anakin87/zephyr-7b-alpha-sharded"

# NOTE(review): not referenced by the visible loading code, which relies on
# device_map="auto" instead; kept in case other code reads it.
device = "cuda"  # the device to load the model onto

# 4-bit NF4 quantization with double quantization and a bfloat16 compute dtype.
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Quantization is requested solely through `quantization_config`. Passing
# `load_in_4bit=True` as a separate keyword alongside it is redundant, and
# recent transformers releases reject the combination with a ValueError.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config,
    device_map="auto",
)

# Wrap the quantized base model with the fine-tuned adapter weights.
model = PeftModel.from_pretrained(model, adapters_name)

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.bos_token_id = 1  # explicitly pin the BOS token id

# NOTE(review): not referenced anywhere in the visible code (generation below
# uses no custom stopping criteria); kept in case other code reads it.
stop_token_ids = [0]

print(f"Successfully loaded the model {model_name} into memory")

# Imported here, after model loading, exactly where the original script did it.
import gradio as gr
def ui(text: str) -> str:
    """Generate text from the module-level model for a single prompt.

    Args:
        text: Prompt to tokenize and pass to ``model.generate``.

    Returns:
        The first generated sequence decoded with special tokens skipped.
        NOTE(review): the decoded sequence presumably still begins with the
        prompt, since nothing here strips the input tokens - confirm that
        this is the intended output for the UI.
    """
    # Send the inputs to wherever the model actually lives instead of a
    # hard-coded "cuda:0": the model is loaded with device_map="auto", so its
    # placement is decided at load time and may not be the first GPU.
    inputs = tokenizer(text, return_tensors="pt").to(model.device)
    # Cap the response at 500 newly generated tokens.
    outputs = model.generate(**inputs, max_new_tokens=500)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
# Single text-in / text-out Gradio form that routes each submission to `ui`.
demo = gr.Interface(
    fn=ui,
    inputs="text",
    outputs="text",
    title="Banking Query Assistant",
    description="Enter the query prompt for which you require assistance",
)

# Start serving the interface; `inline=False` is passed through unchanged.
demo.launch(inline=False)