from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
import gradio as gr
import torch
import spaces  # Hugging Face Spaces helper (e.g. ZeroGPU); unused unless @spaces.GPU is applied

model_id = "deepapaikar/Llama_SCplusQA_10epochs"

print("Before loading model")

# 4-bit NF4 quantization so the model fits in limited GPU memory
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    return_dict=True,
    device_map="auto",
    trust_remote_code=True,
    quantization_config=bnb_config,  # BitsAndBytesConfig is passed here, not via `config`
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=100,  # caps prompt + completion at 100 tokens
    trust_remote_code=True,
)

print("Model loaded successfully")

# system_message = "Answer the questions truthfully and to the point."


def generate_response(query, history):
    # Wrap the question in Llama instruction tags and return only the completion,
    # not the echoed prompt.
    ans = pipe(f"[INST] {query} [/INST]", return_full_text=False)
    return ans[0]["generated_text"]


demo = gr.ChatInterface(generate_response)

if __name__ == "__main__":
    demo.launch(share=True)
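
# Optional smoke test (a minimal sketch): calling generate_response directly is a
# quick way to confirm the model loads and produces text before relying on the
# Gradio UI. The sample question below is illustrative only, not from the model card.
#
# print(generate_response("What is this model fine-tuned to answer?", history=[]))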