import gradio as gr
import plotly.express as px
import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Set environment variables for GPU usage and memory allocation
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
torch.cuda.empty_cache()
torch.cuda.set_per_process_memory_fraction(0.8)  # Adjust the fraction as needed

# Define device
device = "cuda"  # The device to load the model onto

# System message (placeholder, adjust as needed)
system_message = ""
# Load the model and tokenizer
def hermes_model():
    tokenizer = AutoTokenizer.from_pretrained("TheBloke/CapybaraHermes-2.5-Mistral-7B-AWQ")
    model = AutoModelForCausalLM.from_pretrained(
        "TheBloke/CapybaraHermes-2.5-Mistral-7B-AWQ", low_cpu_mem_usage=True, device_map="auto"
    )
    return model, tokenizer

model, tokenizer = hermes_model()
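# Note: loading an AWQ-quantized checkpoint through Transformers relies on its AWQ
# integration, which expects the `autoawq` package to be installed (assumption: it
# is listed in the Space's requirements).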
# Function to generate a response from the model
def chat_response(msg_prompt: str) -> str:
    """
    Generates a response from the model given a prompt.

    Args:
        msg_prompt (str): The user's message prompt.

    Returns:
        str: The model's response.
    """
    generation_params = {
        "do_sample": True,
        "temperature": 0.7,
        "top_p": 0.95,
        "top_k": 40,
        "max_new_tokens": 512,
        "repetition_penalty": 1.1,
    }
    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, **generation_params)
    try:
        # ChatML prompt format used by CapybaraHermes-2.5
        prompt_template = f'''<|im_start|>system
{system_message}<|im_end|>
<|im_start|>user
{msg_prompt}<|im_end|>
<|im_start|>assistant
'''
        pipe_output = pipe(prompt_template)[0]['generated_text']
        # Keep only the assistant's reply (everything after the assistant tag)
        response_parts = pipe_output.split('<|im_start|>assistant')
        assistant_response = response_parts[-1].strip() if len(response_parts) > 1 else pipe_output.strip()
        return assistant_response
    except Exception as e:
        return str(e)
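# Hypothetical standalone usage (outside the Gradio UI); uncomment to smoke-test
# the model without launching the app:
# print(chat_response("Summarize the Iris dataset in one sentence."))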
# Function to generate a random plot
def random_plot():
    df = px.data.iris()
    fig = px.scatter(df, x="sepal_width", y="sepal_length", color="species",
                     size='petal_length', hover_data=['petal_width'])
    return fig

# Function to handle likes/dislikes (for demonstration purposes)
def print_like_dislike(x: gr.LikeData):
    print(x.index, x.value, x.liked)
# Function to add messages to the chat history
def add_message(history, message):
    # Use lists (not tuples) for each chat pair so the bot can fill in the reply in place
    for x in message["files"]:
        history.append([(x,), None])
    if message["text"] is not None:
        history.append([message["text"], None])
    # Clear and disable the textbox until the bot has responded (re-enabled below)
    return history, gr.update(value=None, interactive=False)

# Function to generate the bot response
def bot(history):
    user_message = history[-1][0]
    bot_response = chat_response(user_message)
    history[-1][1] = bot_response
    return history
fig = random_plot()

# Gradio interface setup
with gr.Blocks(fill_height=True) as demo:
    chatbot = gr.Chatbot(elem_id="chatbot", bubble_full_width=False, scale=1)
    chat_input = gr.MultimodalTextbox(
        interactive=True,
        file_count="multiple",
        placeholder="Enter message or upload file...",
        show_label=False
    )
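    # Assumption: the Plotly figure created above is intended to be shown alongside
    # the chat; a simple gr.Plot component renders it.
    gr.Plot(fig, label="Sample plot")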
    chat_msg = chat_input.submit(add_message, [chatbot, chat_input], [chatbot, chat_input])
    bot_msg = chat_msg.then(bot, chatbot, chatbot)
    bot_msg.then(lambda: gr.update(interactive=True), None, [chat_input])

    chatbot.like(print_like_dislike, None, None)

demo.queue()
demo.launch()