Spaces:

Mat17892
/

iris

Runtime error

File size: 2,421 Bytes

5fd0c28
f613acc
 
5fd0c28
8a91905
f613acc
 
 
 
5fd0c28
f613acc
 
 
 
 
 
 
 
 
 
5fd0c28
f613acc
 
5fd0c28
 
 
 
 
 
 
 
f613acc
5fd0c28
 
f613acc
5fd0c28
 
 
 
 
 
f613acc
5fd0c28
 
f613acc
 
5fd0c28
f613acc
 
 
 
 
 
 
 
 
5fd0c28
 
f613acc
 
 
 
 
5fd0c28
f613acc
 
5fd0c28
 
f613acc
5fd0c28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f613acc

import gradio as gr
from unsloth import FastLanguageModel
import torch


# Model configuration (adjust the path to where your model is stored).
# NOTE(review): `model_path` looks like a Google Drive mount path; confirm that
# it exists in the environment this script is deployed to.
max_seq_length = 2048  # Maximum sequence length handed to the loader; adjust as necessary
load_in_4bit = True  # Enable 4-bit quantization for reduced memory usage
model_path = "/content/drive/My Drive/llama_lora_model_1"  # Path to your custom model

# Load the model and tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_path,
    max_seq_length=max_seq_length,
    load_in_4bit=load_in_4bit,
)

# Device that `respond` moves the tokenized prompt to before generation.
device = "cuda" if torch.cuda.is_available() else "cpu"

# A bitsandbytes-quantized model is placed on its device by the loader, and
# calling `.to()` on it can raise ValueError (depending on the installed
# transformers / bitsandbytes versions), so only move non-quantized weights.
if not load_in_4bit:
    model = model.to(device)

# Put the Unsloth model into inference mode before any call to `generate`.
FastLanguageModel.for_inference(model)


# Respond function
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    """Generate the assistant's reply to ``message``.

    Args:
        message: The user's newest message.
        history: Earlier turns of the conversation, either as
            ``(user, assistant)`` pairs or as ``{"role", "content"}`` dicts.
        system_message: Text used as the system prompt.
        max_tokens: Upper bound on the number of newly generated tokens.
        temperature: Sampling temperature; a value of 0 (or ``None``)
            switches to greedy decoding.
        top_p: Nucleus-sampling probability mass (used only when sampling).

    Returns:
        The decoded text of the newly generated tokens only; the prompt
        (system message, history, and user message) is not echoed back.
    """
    messages = _build_messages(message, history, system_message)

    # `return_dict=True` guarantees a mapping with "input_ids" and
    # "attention_mask"; without it the call may return a bare tensor, which
    # cannot be indexed by key.
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt",
    ).to(device)
    prompt_length = inputs["input_ids"].shape[-1]

    # `temperature` / `top_p` are only honoured when sampling is enabled;
    # with the default greedy decoding they would be ignored.
    sampling = temperature is not None and temperature > 0
    generation_kwargs = {
        "max_new_tokens": int(max_tokens),
        "use_cache": True,
        "do_sample": sampling,
    }
    if sampling:
        generation_kwargs["temperature"] = float(temperature)
        generation_kwargs["top_p"] = float(top_p)

    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        **generation_kwargs,
    )

    # The generated sequence starts with the prompt tokens; drop them so that
    # only the new reply is decoded and returned.
    new_tokens = outputs[0][prompt_length:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)


def _build_messages(message, history, system_message):
    """Assemble the chat-template message list: system prompt, history, new message."""
    messages = [{"role": "system", "content": system_message}]

    for turn in history or []:
        if isinstance(turn, dict):
            # Dict-style history entry: already a role/content mapping.
            if turn.get("role") and turn.get("content"):
                messages.append({"role": turn["role"], "content": turn["content"]})
            continue
        # Pair-style history entry: (user text, assistant text); skip empty sides.
        user_text, assistant_text = turn[0], turn[1]
        if user_text:
            messages.append({"role": "user", "content": user_text})
        if assistant_text:
            messages.append({"role": "assistant", "content": assistant_text})

    messages.append({"role": "user", "content": message})
    return messages


# Gradio interface setup
# Extra controls for the chat UI, listed in the order `respond` receives them
# after `message` and `history`.
system_message_box = gr.Textbox(
    value="You are a friendly Chatbot.",
    label="System message",
)
max_tokens_slider = gr.Slider(
    minimum=1,
    maximum=2048,
    value=512,
    step=1,
    label="Max new tokens",
)
temperature_slider = gr.Slider(
    minimum=0.1,
    maximum=4.0,
    value=0.7,
    step=0.1,
    label="Temperature",
)
top_p_slider = gr.Slider(
    minimum=0.1,
    maximum=1.0,
    value=0.95,
    step=0.05,
    label="Top-p (nucleus sampling)",
)

# Chat UI that routes every user message through `respond`.
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        system_message_box,
        max_tokens_slider,
        temperature_slider,
        top_p_slider,
    ],
)

# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()