import torch
import transformers
import gradio as gr
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

adapters_name = "SohamNale/Soham_Misteral"
model_name = "anakin87/zephyr-7b-alpha-sharded"
device = "cuda"  # the device to load the model onto

# QLoRA-style 4-bit quantization: NF4 weights with double quantization,
# computing in bfloat16
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the sharded base model. quantization_config already requests 4-bit
# loading, so the redundant load_in_4bit kwarg (which newer transformers
# versions reject when quantization_config is also passed) is dropped.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config,
    device_map="auto",
)

# Attach the fine-tuned LoRA adapter weights on top of the quantized base
model = PeftModel.from_pretrained(model, adapters_name)

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.bos_token_id = 1
stop_token_ids = [0]

print(f"Successfully loaded the model {model_name} into memory")

def ui(text):
    # Tokenize the prompt, generate a completion, and decode it back to text
    device = "cuda:0"
    inputs = tokenizer(text, return_tensors="pt").to(device)
    outputs = model.generate(**inputs, max_new_tokens=500)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

demo = gr.Interface(
    fn=ui,
    inputs="text",
    outputs="text",
    title="Banking Query Assistant",
    description="Enter the query prompt for which you require assistance",
)
demo.launch(inline=False)
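
# ---------------------------------------------------------------------------
# The script defines stop_token_ids above but never passes it to generate().
# Below is a minimal sketch of wiring it in via transformers' StoppingCriteria;
# StopOnTokens and the sample prompt are illustrative additions, not part of
# the original script. Note that demo.launch() blocks, so this only runs once
# the Gradio UI is closed (or if moved above the launch() call).
from transformers import StoppingCriteria, StoppingCriteriaList

class StopOnTokens(StoppingCriteria):
    """Stop generation as soon as the last emitted token is in stop_ids."""

    def __init__(self, stop_ids):
        self.stop_ids = stop_ids

    def __call__(self, input_ids, scores, **kwargs):
        # input_ids holds the full sequence so far; check its last token
        return input_ids[0, -1].item() in self.stop_ids

sample = tokenizer("How do I block a stolen card?", return_tensors="pt").to(device)
sample_out = model.generate(
    **sample,
    max_new_tokens=500,
    stopping_criteria=StoppingCriteriaList([StopOnTokens(stop_token_ids)]),
)
print(tokenizer.decode(sample_out[0], skip_special_tokens=True))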