# jais-13b-chat / app.py
import gradio as gr
import torch
from peft import prepare_model_for_kbit_training
from transformers import AutoTokenizer, AutoModelForCausalLM
model_path = "inception-mbzuai/jais-13b-chat"

# Chat prompt templates: the user's question is substituted for {Question}.
# prompt_ar carries the same instruction in Arabic ("Complete the conversation
# below between [|Human|] and [|AI|]").
prompt_eng = "### Instruction: \n\nComplete the conversation below between [|Human|] and [|AI|]:\n### Input: [|Human|] {Question}\n### Response: [|AI|]"
prompt_ar = "### Instruction: \n\nΨ£ΩƒΩ…Ω„ Ψ§Ω„Ω…Ψ­Ψ§Ψ―Ψ«Ψ© Ψ£Ψ―Ω†Ψ§Ω‡ Ψ¨ΩŠΩ† [|Human|] و [|AI|]:\n### Input: [|Human|] {Question}\n### Response: [|AI|]"
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained(model_path)
#model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto", trust_remote_code=True)
#model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("inception-mbzuai/jais-13b-chat", load_in_8bit=True, device_map="auto", trust_remote_code=True)
model = prepare_model_for_kbit_training(model)
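
# Assumption: on newer transformers/bitsandbytes releases the bare load_in_8bit
# flag is superseded by an explicit quantization config. An equivalent sketch:
#
#   from transformers import BitsAndBytesConfig
#   model = AutoModelForCausalLM.from_pretrained(
#       model_path,
#       quantization_config=BitsAndBytesConfig(load_in_8bit=True),
#       device_map="auto",
#       trust_remote_code=True,
#   )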

def get_response(text, tokenizer=tokenizer, model=model):
    # Tokenize the prompt and move it onto the model's device.
    input_ids = tokenizer(text, return_tensors="pt").input_ids
    inputs = input_ids.to(device)
    input_len = inputs.shape[-1]
    # Sample a completion; the length bounds are set relative to the prompt length.
    generate_ids = model.generate(
        inputs,
        top_p=0.9,
        temperature=0.3,
        max_length=2048 - input_len,
        min_length=input_len + 4,
        repetition_penalty=1.2,
        do_sample=True,
    )
    response = tokenizer.batch_decode(
        generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
    )[0]
    # Keep only the text that follows the response tag, i.e. the model's answer.
    response = response.split("### Response: [|AI|]")[-1]
    return response
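
# The English template (prompt_eng) is defined above but never used. A hypothetical
# English-prompt counterpart to the Arabic path below, added here purely as an
# illustrative sketch, could look like this:
def greet_en(question):
    text = prompt_eng.format_map({'Question': question})
    return get_response(text)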

def greet(question):
    # Fill the Arabic chat template with the user's question and generate a reply.
    text = prompt_ar.format_map({'Question': question})
    return get_response(text)
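
# The original file imports gradio but stops before building an interface. The
# wiring below is a minimal sketch of the assumed missing piece; the names
# ("demo", the textbox labels, the title) are illustrative, not from the source.
demo = gr.Interface(
    fn=greet,
    inputs=gr.Textbox(label="Question (Arabic)"),
    outputs=gr.Textbox(label="Answer"),
    title="jais-13b-chat",
)
demo.launch()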