Spaces:

CyberNative-AI
/

Colibri_8b_v0.1_chat

Runtime error

File size: 4,304 Bytes

8ca3bc5
a203e8e
 
b555c35
a203e8e
ac2d8d3
8ca3bc5
a203e8e
 
 
 
 
 
daacba2
 
a203e8e
 
 
 
 
 
daacba2
8ca3bc5
a203e8e
 
 
daacba2
931cd7a
a203e8e
 
8ca3bc5
 
 
a203e8e
 
 
 
 
 
 
 
 
 
 
 
8ca3bc5
a203e8e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7b92038
a203e8e
 
 
8ca3bc5
ac2d8d3
 
 
 
 
 
 
a203e8e
ac2d8d3
817178f
68bcc63
 
 
 
 
 
 
a203e8e
8ca3bc5
a203e8e
79fc053
8ca3bc5
a203e8e
 
 
 
 
 
 
 
 
 
 
 
f61b980
a203e8e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8ca3bc5

import os
from threading import Thread
from typing import Iterator

import gradio as gr
import spaces
from llama_cpp import Llama
from transformers import AutoTokenizer, TextIteratorStreamer

# Read the Hugging Face access token from the environment (None when unset).
# NOTE(review): HF_TOKEN is not referenced by any code visible in this file.
HF_TOKEN = os.environ.get("HF_TOKEN", None)


# HTML header rendered above the chat widget.
DESCRIPTION = '''
<div>
<h1 style="text-align: center;">CyberNative-AI/Colibri_8b_v0.1</h1>
<p>This Space demonstrates the CyberSecurity-tuned model <a href="https://huggingface.co/CyberNative-AI/Colibri_8b_v0.1"><b>Colibri_8b_v0.1</b></a>.</p>
</div>
'''

# Footer note rendered below the chat widget.
LICENSE = """
<p/>
---
Colibri v0.1 is built on top of Dolphin Llama 3
"""

# HTML passed as the Chatbot placeholder (logo + prompt hint).
PLACEHOLDER = """
<div style="padding: 30px; text-align: center; display: flex; flex-direction: column; align-items: center;">
   <img src="https://huggingface.co/CyberNative-AI/Colibri_8b_v0.1/resolve/main/cybernative_ai_colibri_logo.jpeg" style="width: 80%; max-width: 550px; height: auto; opacity: 0.55;  "> 
   <h1 style="font-size: 28px; margin-bottom: 2px; opacity: 0.55;">Colibri_v0.1</h1>
   <p style="font-size: 18px; margin-bottom: 2px; opacity: 0.65;">Ask me anything...</p>
</div>
"""


# Custom CSS handed to gr.Blocks: centered headings and a pill-shaped
# "#duplicate-button" style.
css = """
h1 {
  text-align: center;
  display: block;
}
#duplicate-button {
  margin: auto;
  color: white;
  background: #1565c0;
  border-radius: 100vh;
}
"""

# Lazily-created llama.cpp model, reused by later calls in the same process so
# the GGUF file is not re-loaded for every chat message.
_LLM = None


def _get_llm():
    """Return the shared Llama instance, loading it on first use."""
    global _LLM
    if _LLM is None:
        # Only constructor options belong here; generation options such as
        # max_tokens/stop are per-request and go to create_chat_completion().
        # NOTE(review): n_ctx is left at the library default - confirm it is
        # large enough for long conversations.
        _LLM = Llama.from_pretrained(
            repo_id="CyberNative-AI/Colibri_8b_v0.1_q5_gguf",
            filename="*Q5_K_M.gguf",
            chat_format="chatml",
            verbose=False,
        )
    return _LLM


@spaces.GPU(duration=120)
def chat_llama3_8b(message: str,
                   history: list,
                   temperature: float,
                   max_new_tokens: int
                   ) -> Iterator[str]:
    """
    Generate a streaming response using the Colibri (llama3-8b based) model.

    Args:
        message (str): The input message.
        history (list): The conversation history used by ChatInterface, as
            (user, assistant) pairs.
        temperature (float): The temperature for generating the response.
        max_new_tokens (int): The maximum number of new tokens to generate.

    Yields:
        str: The response text accumulated so far; the last value yielded is
        the complete reply.
    """
    conversation = [
        {"role": "system", "content": "You are Colibri, an advanced cybersecurity AI assistant developed by CyberNative AI."}
    ]
    for user, assistant in history:
        conversation.append({"role": "user", "content": user})
        conversation.append({"role": "assistant", "content": assistant})
    conversation.append({"role": "user", "content": message})

    llm = _get_llm()

    # max_tokens and stop are generation parameters: they must be passed here
    # (not to the constructor) for the UI's "Max new tokens" setting and the
    # ChatML end marker to take effect.
    stream = llm.create_chat_completion(
        messages=conversation,
        temperature=temperature,
        max_tokens=int(max_new_tokens),
        stop=["<|im_end|>"],
        stream=True,
    )

    partial_text = ""
    for chunk in stream:
        # Streamed chunks carry incremental text under choices[0]["delta"];
        # some chunks (role announcement, finish marker) have no "content".
        delta = chunk["choices"][0].get("delta", {})
        piece = delta.get("content")
        if piece:
            partial_text += piece
            yield partial_text

    # Always yield at least once so the caller receives a (possibly empty)
    # reply even when the model produced no text.
    if not partial_text:
        yield partial_text
        

# Gradio UI wiring
chatbot = gr.Chatbot(label='Gradio ChatInterface', placeholder=PLACEHOLDER, height=700)

with gr.Blocks(css=css, fill_height=True) as demo:

    gr.Markdown(DESCRIPTION)

    # The extra controls are created with render=False and handed to the
    # ChatInterface below instead of being rendered where they are created.
    parameters_accordion = gr.Accordion(label="⚙️ Parameters", open=False, render=False)
    temperature_slider = gr.Slider(
        label="Temperature",
        minimum=0,
        maximum=1,
        step=0.1,
        value=0.6,
        render=False,
    )
    max_new_tokens_slider = gr.Slider(
        label="Max new tokens",
        minimum=128,
        maximum=4096,
        step=1,
        value=512,
        render=False,
    )

    # Sample prompts shown to the user; each entry fills only the message box.
    example_prompts = [
        ['What are the two main methods used in the research to collect DKIM information?'],
        ['What is the primary purpose of OS fingerprinting using tools like Nmap, and why might it not always be 100% accurate?'],
        ['What is 9,000 * 9,000?'],
        ['What technique can be used to enumerate SMB shares within a Windows environment from a Windows client?'],
        ['What is the primary benefit of interleaving in cybersecurity education and training?'],
    ]

    gr.ChatInterface(
        fn=chat_llama3_8b,
        chatbot=chatbot,
        fill_height=True,
        additional_inputs_accordion=parameters_accordion,
        additional_inputs=[temperature_slider, max_new_tokens_slider],
        examples=example_prompts,
        cache_examples=False,
    )

    gr.Markdown(LICENSE)

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()