"""Gradio chat demo for the DeepMount00/Llama-3-8b-Ita model.

The tokenizer and model are loaded once at import time. The UI is a
``gr.ChatInterface`` whose callback, ``chat_llama3_8b``, returns one complete
(non-streamed) reply per user message.
"""

import time
from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
import gradio as gr
import torch
import spaces

model_id = "DeepMount00/Llama-3-8b-Ita"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
).eval()

DESCRIPTION = '''

Meta Llama3 8B ITA

This Space demonstrates the instruction-tuned model Meta Llama3 8b Chat ITA.

This model, DeepMount00/Llama-3-8b-Ita, is currently the best open-source large language model for the Italian language. You can view its ranking and compare it with other models on the leaderboard at this site.

'''

PLACEHOLDER = """

DeepMount00 llama3

Chiedimi qualsiasi cosa...

"""

css = """ h1 { text-align: center; display: block; } """

# System message placed at the start of every conversation sent to the model.
_SYSTEM_PROMPT = "Sei un assistente specializzato nella lingua italiana."

# Maximum number of past messages (user and assistant counted separately)
# replayed to the model; older ones are dropped.
_MAX_HISTORY_MESSAGES = 16

# Role name that can leak into the decoded text as a leading header.
_ROLE_LABEL = "assistant"


def _build_conversation(message: str, history: list) -> list:
    """Return the chat-template message list for ``message`` plus past turns.

    ``history`` is iterated as a sequence of ``(user, assistant)`` pairs; only
    the most recent ``_MAX_HISTORY_MESSAGES`` individual messages are kept.
    """
    conversation = [{"role": "system", "content": _SYSTEM_PROMPT}]

    flat_history = [item for turn in history for item in turn]
    if len(flat_history) > _MAX_HISTORY_MESSAGES:
        flat_history = flat_history[-_MAX_HISTORY_MESSAGES:]

    # Re-pair the flattened messages; zip() drops a trailing unpaired item
    # instead of raising IndexError.
    for user_text, assistant_text in zip(flat_history[::2], flat_history[1::2]):
        conversation.append({"role": "user", "content": user_text})
        conversation.append({"role": "assistant", "content": assistant_text})

    conversation.append({"role": "user", "content": message})
    return conversation


def _clean_response(decoded: str) -> str:
    """Trim whitespace and drop a leaked leading ``assistant`` role header.

    ``str.strip("assistant")`` must not be used here: it treats its argument
    as a set of characters and would eat any leading/trailing run of the
    letters a, s, i, t, n from a legitimate reply.
    """
    text = decoded.strip()
    if text.startswith(_ROLE_LABEL):
        remainder = text[len(_ROLE_LABEL):]
        # Only treat the word as a header when it stands alone, i.e. it is
        # followed by ':' or a line break or nothing; otherwise it is part of
        # the reply itself (e.g. "assistants ...").
        if not remainder or remainder[0] in ":\n":
            text = remainder.lstrip(":").strip()
    return text


@spaces.GPU(duration=120)
def chat_llama3_8b(message: str, history: list, temperature: float, max_new_tokens: int) -> str:
    """Generate the assistant's reply to ``message``.

    Args:
        message: The new user message.
        history: Previous turns, iterated as ``(user, assistant)`` pairs.
        temperature: Sampling temperature; ``0`` (or less) selects
            non-sampling decoding (``do_sample=False``).
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded reply with special tokens skipped and any leaked role
        header removed.
    """
    conversation = _build_conversation(message, history)

    # add_generation_prompt=True makes the template end with the start of an
    # assistant turn, so the model continues with the reply rather than
    # having to produce the role header on its own.
    input_ids = tokenizer.apply_chat_template(
        conversation,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(model.device)

    do_sample = temperature > 0
    # Keep the value handed to generate() strictly positive even when the
    # slider is at 0.
    real_temperature = max(temperature, 0.001)

    generated_ids = model.generate(
        input_ids=input_ids,
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
        temperature=real_temperature,
        eos_token_id=tokenizer.eos_token_id,
    )

    # generate() returns prompt + continuation; keep only the continuation.
    prompt_length = input_ids.size(1)
    new_tokens = generated_ids[:, prompt_length:]
    decoded = tokenizer.batch_decode(new_tokens, skip_special_tokens=True)[0]
    return _clean_response(decoded)


# Gradio block
chatbot = gr.Chatbot(height=450, placeholder=PLACEHOLDER, label='Gradio ChatInterface')

with gr.Blocks(fill_height=True, css=css) as demo:
    gr.Markdown(DESCRIPTION)
    gr.DuplicateButton(value="Duplicate Space for private use", elem_id="duplicate-button")
    gr.ChatInterface(
        fn=chat_llama3_8b,
        chatbot=chatbot,
        fill_height=True,
        additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
        additional_inputs=[
            gr.Slider(minimum=0, maximum=1, step=0.1, value=0.001, label="Temperature", render=False),
            gr.Slider(minimum=128, maximum=4096, step=1, value=512, label="Max new tokens", render=False),
        ],
        examples=[
            ['Quanto è alta la torre di Pisa?'],
            ["Se un mattone pesa 1kg più mezzo mattone, quanto pesa il mattone? rispondi impostando l'equazione"],
            ['Quanto fa 2 * 9?'],
            ['Scrivi una funzione python che calcola i primi n numeri di fibonacci'],
            ['Inventa tre indovinelli tutti diversi con le relative risposte in formato json'],
        ],
        cache_examples=False,
    )

if __name__ == "__main__":
    demo.launch()