from transformers import AutoModelForCausalLM, AutoTokenizer
import gradio as gr
import torch
import spaces

model_id = "DeepMount00/Llama-3-COT-ITA"
tokenizer = AutoTokenizer.from_pretrained(model_id)
# bfloat16 halves memory versus float32; device_map="auto" places the weights on the available GPU
model = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype=torch.bfloat16, device_map="auto"
).eval()

DESCRIPTION = '''
<div>
<h1 style="text-align: center;">Meta Llama3 8B ITA</h1>
<p>This Space demonstrates the instruction-tuned model <a href="https://huggingface.co/DeepMount00/Llama-3-8b-Ita"><b>Meta Llama3 8b Chat ITA</b></a>.</p>
</div>
<div>
<p>This model, <strong>DeepMount00/Llama-3-8b-Ita</strong>, is currently the best open-source large language model for the Italian language. You can view its ranking and compare it with other models on the leaderboard at <a href="https://huggingface.co/spaces/FinancialSupport/open_ita_llm_leaderboard"><b>this site</b></a>.</p>
</div>
'''

PLACEHOLDER = """
<div style="padding: 30px; text-align: center; display: flex; flex-direction: column; align-items: center;">
<img src="https://cdn-avatars.huggingface.co/v1/production/uploads/64f1bf6a8b550e875926a590/9IXg0qMUF0OV2cWPT8cZn.jpeg" style="width: 80%; max-width: 550px; height: auto; opacity: 0.50; ">
<h1 style="font-size: 28px; margin-bottom: 2px; opacity: 0.55;">DeepMount00 llama3</h1>
<p style="font-size: 18px; margin-bottom: 2px; opacity: 0.65;">Chiedimi qualsiasi cosa...</p>
</div>
"""

css = """
h1 {
text-align: center;
display: block;
}
"""

prompt = """Sei un assistente virtuale avanzato, progettato per fornire risposte accurate, utili e tempestive. Segui queste linee guida:
1. **Professionalità**: Rispondi sempre in modo educato e rispettoso.
2. **Chiarezza**: Fornisci informazioni chiare e precise.
3. **Empatia**: Mostra comprensione per le esigenze degli utenti.
4. **Adattabilità**: Adattati agli stili di comunicazione degli utenti.
5. **Privacy**: Non richiedere o raccogliere informazioni personali sensibili.
6. **Supporto**: Assisti con domande generali, risoluzione di problemi tecnici e consigli."""
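
# Note: @spaces.GPU targets Hugging Face ZeroGPU Spaces; it requests a GPU for up to
# `duration` seconds per call and is effectively a no-op when a GPU is permanently attached.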
@spaces.GPU(duration=120)
def chat_llama3_8b(message: str, history: list, temperature: float, max_new_tokens: int) -> str:
# Initialize the conversation with a system prompt
    conversation = [{"role": "system", "content": prompt}]
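    # history arrives as [user, assistant] pairs; flatten it and keep only the
    # most recent 16 messages (8 exchanges) to bound the prompt length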
flat_history = [item for sublist in history for item in sublist]
if len(flat_history) > 16:
flat_history = flat_history[-16:]
# Rebuild the conversation from the trimmed history
for i in range(0, len(flat_history), 2):
conversation.extend([
{"role": "user", "content": flat_history[i]},
{"role": "assistant", "content": flat_history[i + 1]}
])
# Add the current user message to the conversation
conversation.append({"role": "user", "content": message})
# Prepare the input for the model
    # add_generation_prompt=True appends the assistant header so the model
    # produces a reply instead of continuing the last user turn
    input_ids = tokenizer.apply_chat_template(
        conversation, add_generation_prompt=True, return_tensors="pt"
    ).to(model.device)
# Parameters for generating text
    do_sample = temperature > 0  # fall back to greedy decoding at temperature 0
    real_temperature = max(temperature, 0.001)  # generate() rejects temperature == 0 when sampling
# Generate a response from the model
    generated_ids = model.generate(
        input_ids=input_ids,
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
        temperature=real_temperature,
        pad_token_id=tokenizer.eos_token_id,  # silences the missing-pad-token warning
        eos_token_id=tokenizer.eos_token_id,
    )
    # Decode only the newly generated tokens, i.e. everything after the prompt
    input_length = input_ids.size(1)
    new_tokens = generated_ids[:, input_length:]
    decoded = tokenizer.batch_decode(new_tokens, skip_special_tokens=True)[0]
    # str.strip("assistant") strips *characters* from that set, not the word;
    # removeprefix drops a leading role header if the chat template leaks one
    final_response = decoded.strip().removeprefix("assistant").lstrip(":").strip()
    return final_response

# Gradio block
chatbot = gr.Chatbot(height=450, placeholder=PLACEHOLDER, label='Gradio ChatInterface')
with gr.Blocks(fill_height=True, css=css) as demo:
gr.Markdown(DESCRIPTION)
gr.DuplicateButton(value="Duplicate Space for private use", elem_id="duplicate-button")
gr.ChatInterface(
fn=chat_llama3_8b,
chatbot=chatbot,
fill_height=True,
additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
additional_inputs=[
gr.Slider(minimum=0,
maximum=1,
step=0.1,
value=0.001,
label="Temperature",
render=False),
gr.Slider(minimum=128,
maximum=4096,
step=1,
value=512,
label="Max new tokens",
render=False),
],
examples=[
['Quanto è alta la torre di Pisa?'],
["Se un mattone pesa 1kg più mezzo mattone, quanto pesa il mattone? rispondi impostando l'equazione"],
['Quanto fa 2 * 9?'],
['Scrivi una funzione python che calcola i primi n numeri di fibonacci'],
['Inventa tre indovinelli tutti diversi con le relative risposte in formato json']
],
cache_examples=False,
)

if __name__ == "__main__":
demo.launch()
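    # Run with `python app.py`; outside Spaces this assumes a local GPU
    # (or enough RAM for CPU inference, which will be slow for an 8B model)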