DeepMount00 committed
Commit a1130ae
1 Parent(s): d7cc17c

Update app.py

Files changed (1): app.py +118 -108
app.py CHANGED
@@ -5,127 +5,137 @@ import gradio as gr
 import torch
 import spaces

-model_id = "DeepMount00/Llama-3-COT-ITA"
-
-tokenizer = AutoTokenizer.from_pretrained(model_id)
-model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="auto").eval() # to("cuda:0")

 DESCRIPTION = '''
 <div>
-<h1 style="text-align: center;">Meta Llama3 8B ITA</h1>
-<p>This Space demonstrates the instruction-tuned model <a href="https://huggingface.co/DeepMount00/Llama-3-8b-Ita"><b>Meta Llama3 8b Chat ITA</b></a>.</p>
+<h1 style="text-align: center;">Lexora-Medium-7B</h1>
+<p>This Space demonstrates the instruction-tuned model <a href="https://huggingface.co/DeepMount00/Lexora-Medium-7B"><b>Lexora-Medium-7B Chat ITA</b></a>.</p>
 </div>
 <div>
-<p>This model, <strong>DeepMount00/Llama-3-8b-Ita</strong>, is currently the best open-source large language model for the Italian language. You can view its ranking and compare it with other models on the leaderboard at <a href="https://huggingface.co/spaces/FinancialSupport/open_ita_llm_leaderboard"><b>this site</b></a>.</p>
+<p>This model, <strong>DeepMount00/Lexora-Medium-7B</strong>, is currently the best open-source large language model for the Italian language. You can view its ranking and compare it with other models on the leaderboard at <a href="https://huggingface.co/spaces/FinancialSupport/open_ita_llm_leaderboard"><b>this site</b></a>.</p>
 </div>
 '''
-PLACEHOLDER = """
-<div style="padding: 30px; text-align: center; display: flex; flex-direction: column; align-items: center;">
-    <img src="https://cdn-avatars.huggingface.co/v1/production/uploads/64f1bf6a8b550e875926a590/9IXg0qMUF0OV2cWPT8cZn.jpeg" style="width: 80%; max-width: 550px; height: auto; opacity: 0.50; ">
-    <h1 style="font-size: 28px; margin-bottom: 2px; opacity: 0.55;">DeepMount00 llama3</h1>
-    <p style="font-size: 18px; margin-bottom: 2px; opacity: 0.65;">Chiedimi qualsiasi cosa...</p>
-</div>
-"""
-
-
-css = """
-h1 {
-  text-align: center;
-  display: block;
-}
-"""
-
-prompt = """Sei un assistente virtuale avanzato, progettato per fornire risposte accurate, utili e tempestive. Segui queste linee guida:
-
-1. **Professionalità**: Rispondi sempre in modo educato e rispettoso.
-2. **Chiarezza**: Fornisci informazioni chiare e precise.
-3. **Empatia**: Mostra comprensione per le esigenze degli utenti.
-4. **Adattabilità**: Adattati agli stili di comunicazione degli utenti.
-5. **Privacy**: Non richiedere o raccogliere informazioni personali sensibili.
-6. **Supporto**: Assisti con domande generali, risoluzione di problemi tecnici e consigli."""
-
-@spaces.GPU(duration=120)
-def chat_llama3_8b(message: str, history: list, temperature: float, max_new_tokens: int) -> str:
-    # Initialize the conversation with a system prompt
-    conversation = [{"role": "system", "content": f"{prompt}"}]
-
-    flat_history = [item for sublist in history for item in sublist]
-
-    if len(flat_history) > 16:
-        flat_history = flat_history[-16:]
-
-    # Rebuild the conversation from the trimmed history
-    for i in range(0, len(flat_history), 2):
-        conversation.extend([
-            {"role": "user", "content": flat_history[i]},
-            {"role": "assistant", "content": flat_history[i + 1]}
-        ])
-
-    # Add the current user message to the conversation
+MAX_MAX_NEW_TOKENS = 2048
+DEFAULT_MAX_NEW_TOKENS = 1024
+MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
+
+device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+
+model_id = "DeepMount00/Lexora-Medium-7B"
+tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True,)
+model = AutoModelForCausalLM.from_pretrained(
+    model_id,
+    device_map="auto",
+    torch_dtype=torch.bfloat16,
+    attn_implementation="flash_attention_2",
+    trust_remote_code=True,
+)
+model.eval()
+
+
+@spaces.GPU(duration=90)
+def generate(
+    message: str,
+    chat_history: list[tuple[str, str]],
+    system_message: str = "",
+    max_new_tokens: int = 2048,
+    temperature: float = 0.0001,
+    top_p: float = 1.0,
+    top_k: int = 50,
+    repetition_penalty: float = 1.0,
+) -> Iterator[str]:
+    conversation = [{"role": "system", "content": system_message}]
+    for user, assistant in chat_history:
+        conversation.extend(
+            [
+                {"role": "user", "content": user},
+                {"role": "assistant", "content": assistant},
+            ]
+        )
     conversation.append({"role": "user", "content": message})

-    # Prepare the input for the model
-    input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt").to(model.device)
-
-    # Parameters for generating text
-    do_sample = True if temperature > 0 else False # Use sampling unless temperature is 0
-    real_temperature = max(temperature, 0.001) # Avoid zero temperature which disables sampling
+    input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
+    if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
+        input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
+        gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
+    input_ids = input_ids.to(model.device)

-    # Generate a response from the model
-    generated_ids = model.generate(
-        input_ids=input_ids,
+    streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
+    generate_kwargs = dict(
+        {"input_ids": input_ids},
+        streamer=streamer,
         max_new_tokens=max_new_tokens,
-        do_sample=do_sample,
-        temperature=real_temperature,
-        eos_token_id=tokenizer.eos_token_id
+        do_sample=True,
+        top_p=top_p,
+        top_k=top_k,
+        temperature=temperature,
+        num_beams=1,
+        repetition_penalty=repetition_penalty,
     )
-    input_length = input_ids.size(1)
-    new_tokens = generated_ids[:, input_length:]
-    decoded = tokenizer.batch_decode(new_tokens, skip_special_tokens=True)[0]
-
-    final_response = decoded.strip("assistant")
-    if final_response.startswith(':'):
-        final_response = final_response.lstrip(':').strip()
-
-    return final_response
-
-
-
-# Gradio block
-chatbot = gr.Chatbot(height=450, placeholder=PLACEHOLDER, label='Gradio ChatInterface')
-
-with gr.Blocks(fill_height=True, css=css) as demo:
+    t = Thread(target=model.generate, kwargs=generate_kwargs)
+    t.start()
+
+    outputs = []
+    for text in streamer:
+        outputs.append(text)
+        yield "".join(outputs)
+
+
+chat_interface = gr.ChatInterface(
+    fn=generate,
+    additional_inputs=[
+        gr.Textbox(
+            value="",
+            label="System message",
+            render=False,
+        ),
+        gr.Slider(
+            label="Max new tokens",
+            minimum=1,
+            maximum=MAX_MAX_NEW_TOKENS,
+            step=1,
+            value=DEFAULT_MAX_NEW_TOKENS,
+        ),
+        gr.Slider(
+            label="Temperature",
+            minimum=0,
+            maximum=4.0,
+            step=0.1,
+            value=0.001,
+        ),
+        gr.Slider(
+            label="Top-p (nucleus sampling)",
+            minimum=0.05,
+            maximum=1.0,
+            step=0.05,
+            value=1.0,
+        ),
+        gr.Slider(
+            label="Top-k",
+            minimum=1,
+            maximum=1000,
+            step=1,
+            value=50,
+        ),
+        gr.Slider(
+            label="Repetition penalty",
+            minimum=1.0,
+            maximum=2.0,
+            step=0.05,
+            value=1.0,
+        ),
+    ],
+    stop_btn=None,
+    examples=[
+        ["Ciao! Come stai?"],
+    ],
+    cache_examples=False,
+)
+
+with gr.Blocks(css="style.css", fill_height=True, theme="soft") as demo:
     gr.Markdown(DESCRIPTION)
     gr.DuplicateButton(value="Duplicate Space for private use", elem_id="duplicate-button")
-    gr.ChatInterface(
-        fn=chat_llama3_8b,
-        chatbot=chatbot,
-        fill_height=True,
-        additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
-        additional_inputs=[
-            gr.Slider(minimum=0,
-                      maximum=1,
-                      step=0.1,
-                      value=0.001,
-                      label="Temperature",
-                      render=False),
-            gr.Slider(minimum=128,
-                      maximum=4096,
-                      step=1,
-                      value=512,
-                      label="Max new tokens",
-                      render=False),
-        ],
-        examples=[
-            ['Quanto è alta la torre di Pisa?'],
-            ["Se un mattone pesa 1kg più mezzo mattone, quanto pesa il mattone? rispondi impostando l'equazione"],
-            ['Quanto fa 2 * 9?'],
-            ['Scrivi una funzione python che calcola i primi n numeri di fibonacci'],
-            ['Inventa tre indovinelli tutti diversi con le relative risposte in formato json']
-        ],
-        cache_examples=False,
-    )
-
+    chat_interface.render()

 if __name__ == "__main__":
-    demo.launch()
+    demo.queue(max_size=20).launch()
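
The core change in this commit is that the new generate() no longer returns one finished string: it runs model.generate on a background thread and iterates a TextIteratorStreamer, yielding the accumulated text so gr.ChatInterface can render a live token stream. Below is a minimal, self-contained sketch of that threaded streaming pattern. It is illustrative only: the "gpt2" checkpoint and the plain-text prompt are placeholder assumptions, whereas the Space itself loads DeepMount00/Lexora-Medium-7B and builds its input with tokenizer.apply_chat_template.

from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# Placeholder checkpoint for illustration; the Space uses DeepMount00/Lexora-Medium-7B.
model_id = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

inputs = tokenizer("Gradio is", return_tensors="pt")

# skip_prompt drops the input tokens from the stream; skip_special_tokens cleans the decoded text.
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
generate_kwargs = dict(**inputs, streamer=streamer, max_new_tokens=40, do_sample=False)

# generate() blocks until decoding finishes, so it runs on a worker thread
# while the main thread consumes text chunks as they are produced.
thread = Thread(target=model.generate, kwargs=generate_kwargs)
thread.start()

partial = []
for chunk in streamer:  # each iteration yields newly generated text
    partial.append(chunk)
    print("".join(partial))  # in the Space, this accumulated string is yielded to gr.ChatInterface
thread.join()

In app.py the same loop yields "".join(outputs) from the generator function, which is what makes the Gradio chat window update token by token instead of waiting for the full reply.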