Spaces:

DeepMount00
/

Lexora-Lite-3B-Chat

Sleeping

File size: 8,432 Bytes

da784c8
efe44f0
da784c8
 
efe44f0
18f610c
da784c8
 
 
efe44f0
46da343
 
 
 
 
efe44f0
f24ac48
4d7115d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c1e9709
 
 
 
 
 
f24ac48
4d7115d
 
f24ac48
4d7115d
 
f24ac48
 
4d7115d
f24ac48
 
4d7115d
 
f24ac48
 
 
4d7115d
 
c1e9709
f24ac48
c1e9709
f24ac48
 
 
 
c1e9709
 
f24ac48
 
 
c1e9709
 
 
f24ac48
 
 
 
 
c1e9709
 
 
f24ac48
 
 
 
 
c1e9709
 
f24ac48
 
 
 
 
 
 
 
 
 
 
c1e9709
 
f24ac48
 
 
 
 
c1e9709
 
f24ac48
c1e9709
f24ac48
 
c1e9709
 
f24ac48
 
 
 
 
4d7115d
 
f24ac48
 
4d7115d
f24ac48
 
 
4d7115d
 
f24ac48
 
4d7115d
f24ac48
4d7115d
 
f24ac48
 
 
 
 
 
 
 
 
 
 
 
c1e9709
 
 
efe44f0
f24ac48
 
 
efe44f0
 
a1130ae
f24ac48
4d7115d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c1e9709
4d7115d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f24ac48
4d7115d
 
 
 
 
 
f24ac48
 
 
 
 
 
 
 
 
 
 
 
a71fb86
 
 
 
f24ac48
a71fb86
f24ac48
 
a71fb86
 
f24ac48
a71fb86
 
 
 
f24ac48
a71fb86
 
 
 
 
 
 
f24ac48
a71fb86
 
 
 
 
 
 
f24ac48
a71fb86
f24ac48
4d7115d
 
 
 
 
 
 
 
f24ac48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
efe44f0
 
4d7115d
c1e9709

import os
from threading import Thread
from typing import Iterator

import gradio as gr
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
import subprocess

subprocess.run(
    "pip install flash-attn --no-build-isolation",
    env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    shell=True,
)

# Constants and model initialization code remains the same
MAX_MAX_NEW_TOKENS = 2048
DEFAULT_MAX_NEW_TOKENS = 1024
MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model_id = "DeepMount00/Lexora-Lite-3B"

tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    trust_remote_code=True,
)
model.eval()

CUSTOM_CSS = """
.container {
    max-width: 1000px !important;
    margin: auto !important;
}

.header {
    text-align: center;
    margin-bottom: 1rem;
    padding: 1rem;
}

.header h1 {
    font-size: 2rem;
    font-weight: 600;
    color: #1e293b;
    margin-bottom: 0.5rem;
}

.header p {
    color: #64748b;
    font-size: 1rem;
}

.chat-container {
    border-radius: 0.75rem;
    background: white;
    box-shadow: 0 1px 3px 0 rgb(0 0 0 / 0.1);
    height: calc(100vh - 200px);
    display: flex;
    flex-direction: column;
}

.message-container {
    padding: 1rem;
    margin-bottom: 0.5rem;
}

.user-message {
    background: #f8fafc;
    border-left: 3px solid #2563eb;
    padding: 1rem;
    margin: 0.5rem 0;
    border-radius: 0.5rem;
}

.assistant-message {
    background: white;
    border-left: 3px solid #64748b;
    padding: 1rem;
    margin: 0.5rem 0;
    border-radius: 0.5rem;
}

.controls-panel {
    position: fixed;
    right: 1rem;
    top: 1rem;
    width: 300px;
    background: white;
    padding: 1rem;
    border-radius: 0.5rem;
    box-shadow: 0 1px 3px 0 rgb(0 0 0 / 0.1);
    z-index: 1000;
    display: none;
}

.controls-button {
    position: fixed;
    right: 1rem;
    top: 1rem;
    z-index: 1001;
    background: #2563eb !important;
    color: white !important;
    padding: 0.5rem 1rem !important;
    border-radius: 0.5rem !important;
    font-size: 0.875rem !important;
    font-weight: 500 !important;
}

.input-area {
    border-top: 1px solid #e2e8f0;
    padding: 1rem;
    background: white;
    border-radius: 0 0 0.75rem 0.75rem;
}

.textbox {
    border: 1px solid #e2e8f0 !important;
    border-radius: 0.5rem !important;
    padding: 0.75rem !important;
    font-size: 1rem !important;
    box-shadow: 0 1px 2px 0 rgb(0 0 0 / 0.05) !important;
}

.textbox:focus {
    border-color: #2563eb !important;
    outline: none !important;
    box-shadow: 0 0 0 2px rgba(37, 99, 235, 0.2) !important;
}

.submit-button {
    background: #2563eb !important;
    color: white !important;
    padding: 0.5rem 1rem !important;
    border-radius: 0.5rem !important;
    font-size: 0.875rem !important;
    font-weight: 500 !important;
    transition: all 0.2s !important;
}

.submit-button:hover {
    background: #1d4ed8 !important;
}
"""

DESCRIPTION = '''
<div class="header">
    <h1>Lexora-Lite-3B Chat</h1>
    <p>An advanced Italian language model ready to assist you</p>
</div>
'''

# Generate function remains the same
@spaces.GPU(duration=90)
def generate(
    message: str,
    chat_history: list[tuple[str, str]],
    system_message: str = "",
    max_new_tokens: int = 2048,
    temperature: float = 0.0001,
    top_p: float = 1.0,
    top_k: int = 50,
    repetition_penalty: float = 1.0,
) -> Iterator[str]:
    conversation = [{"role": "system", "content": system_message}]
    for user, assistant in chat_history:
        conversation.extend(
            [
                {"role": "user", "content": user},
                {"role": "assistant", "content": assistant},
            ]
        )
    conversation.append({"role": "user", "content": message})

    input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
    if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
        input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
        gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
    input_ids = input_ids.to(model.device)

    streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        {"input_ids": input_ids},
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_p=top_p,
        top_k=top_k,
        temperature=temperature,
        num_beams=1,
        repetition_penalty=repetition_penalty,
    )
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()

    outputs = []
    for text in streamer:
        outputs.append(text)
        yield "".join(outputs)

def create_chat_interface():
    theme = gr.themes.Soft(
        primary_hue="blue",
        secondary_hue="slate",
        neutral_hue="slate",
        font=gr.themes.GoogleFont("Inter"),
        radius_size=gr.themes.sizes.radius_sm,
    )

    with gr.Blocks(css=CUSTOM_CSS, theme=theme) as demo:
        gr.Markdown(DESCRIPTION)

        with gr.Row():
            # Main chat column
            with gr.Column(scale=3):
                chat = gr.ChatInterface(
                    fn=generate,
                    additional_inputs=[
                        gr.Textbox(
                            value="",
                            label="System Message",
                            visible=False,
                        ),
                        gr.Slider(
                            label="Temperature",
                            minimum=0,
                            maximum=1.0,
                            step=0.1,
                            value=0.0001,
                            visible=False,
                        ),
                        gr.Slider(
                            label="Top-p",
                            minimum=0.05,
                            maximum=1.0,
                            step=0.05,
                            value=1.0,
                            visible=False,
                        ),
                        gr.Slider(
                            label="Top-k",
                            minimum=1,
                            maximum=1000,
                            step=1,
                            value=50,
                            visible=False,
                        ),
                        gr.Slider(
                            label="Repetition Penalty",
                            minimum=1.0,
                            maximum=2.0,
                            step=0.05,
                            value=1.0,
                            visible=False,
                        ),
                    ],
                    examples=[
                        ["Ciao! Come stai?"],
                        ["Raccontami una breve storia."],
                        ["Qual è il tuo piatto italiano preferito?"],
                    ],
                    cache_examples=False,
                )
            
            # Advanced settings panel
            with gr.Column(scale=1, visible=False) as settings_panel:
                gr.Markdown("### Advanced Settings")
                gr.Slider(
                    label="Temperature",
                    minimum=0,
                    maximum=1.0,
                    step=0.1,
                    value=0.0001,
                )
                gr.Slider(
                    label="Top-p",
                    minimum=0.05,
                    maximum=1.0,
                    step=0.05,
                    value=1.0,
                )
                gr.Slider(
                    label="Top-k",
                    minimum=1,
                    maximum=1000,
                    step=1,
                    value=50,
                )
                gr.Slider(
                    label="Repetition Penalty",
                    minimum=1.0,
                    maximum=2.0,
                    step=0.05,
                    value=1.0,
                )

if __name__ == "__main__":
    demo = create_chat_interface()
    demo.queue(max_size=20).launch()