import os

import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Read the Hugging Face access token from the environment (set as a Space secret)
access_token = os.getenv("HF_TOKEN")

# Repository ID of the fine-tuned model
repo_id = "Mikhil-jivus/Llama-32-3B-FineTuned"
# Load the tokenizer and model from the Hugging Face repository
tokenizer = AutoTokenizer.from_pretrained(repo_id, token=access_token)
if tokenizer.pad_token_id is None:
    # Llama tokenizers ship without a pad token; reuse EOS so the attention
    # mask below can be built without raising an error
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    token=access_token,
    torch_dtype=torch.bfloat16,  # fall back to torch.float16 if bfloat16 is unsupported
    device_map="auto",  # place the model on the available GPU/CPU automatically
)
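# Note: device_map="auto" relies on the accelerate package being installed in the
# Space, and HF_TOKEN must be provided as a Space secret if the repository is
# private or gated.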
# Clean up any segment of the previous bot reply that the model repeated verbatim
def clean_response(response, history):
    # Check whether the last bot reply reappears in the new response and strip it out
    if len(history) > 0:
        last_user_message, last_bot_response = history[-1]
        if last_bot_response in response:
            response = response.replace(last_bot_response, "").strip()
    return response
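# Illustration (hypothetical values): if the previous turn was
#   ("hi", "Hello! How can I help?")
# and the model generates "Hello! How can I help? I can answer questions.",
# clean_response returns "I can answer questions."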
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    # Build the prompt: system message first, then the previous turns,
    # then the new user message
    input_text = f"system: {system_message}\n"
    for user_msg, bot_msg in history:
        input_text += f"user: {user_msg}\nassistant: {bot_msg}\n"
    input_text += f"user: {message}\nassistant: "

    # Tokenize the prompt and move it to the model's device (works with or without a GPU)
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(model.device)
    # Create the attention mask on the same device as the input ids
    attention_mask = input_ids.ne(tokenizer.pad_token_id).long().to(model.device)

    # Generate a response
    chat_history_ids = model.generate(
        input_ids,
        max_new_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True,
        attention_mask=attention_mask,
    )
    # Decode only the newly generated tokens (everything after the prompt)
    response = tokenizer.decode(
        chat_history_ids[:, input_ids.shape[-1]:][0], skip_special_tokens=True
    )

    # Clean the response to remove any repeated or unnecessary text
    response = clean_response(response, history)

    # Update the local history with the new user message and bot response
    history.append((message, response))
    return response
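# For reference, a two-turn conversation produces a prompt of the form
# (values are illustrative):
#   system: You are a helpful and friendly assistant.
#   user: What is 2 + 2?
#   assistant: 4.
#   user: And times 3?
#   assistant: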
# Set up the Gradio chat interface
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a helpful and friendly assistant.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
)
if __name__ == "__main__":
    demo.launch(share=True)
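# A rough sketch of calling the running Space programmatically with the
# gradio_client package (the Space name and "/chat" endpoint are assumptions):
#
#   from gradio_client import Client
#   client = Client("Mikhil-jivus/<space-name>")
#   reply = client.predict(
#       "Hello!",                                      # message
#       "You are a helpful and friendly assistant.",   # system message
#       512, 0.7, 0.95,                                # max new tokens, temperature, top-p
#       api_name="/chat",
#   )
#   print(reply)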