Spaces:

amd
/

llama3-8b-mi-amd

Running

App Files Files Community

Lohia, Aditya committed on Sep 17

Commit

a9409d4

•

1 Parent(s): 12e4d9f

Updated Spaces

Browse files

Files changed (4) hide show

app.py +147 -0
dialog.py +45 -0
gateway.py +90 -0
requirements.txt +5 -0

app.py ADDED Viewed

	@@ -0,0 +1,147 @@

+import os
+import gradio as gr
+from typing import Iterator
+from dialog import get_dialog_box
+from gateway import check_server_health, request_generation
+# CONSTANTS
+# Upper bound used for the "Max New Tokens" slider of the chat interface.
+MAX_NEW_TOKENS: int = 2048
+# GET ENVIRONMENT VARIABLES
+# Base URL of the backend gateway, read from API_ENDPOINT (None when the variable is unset).
+CLOUD_GATEWAY_API = os.getenv("API_ENDPOINT")
+def toggle_ui():
+    """
+    Function to toggle the visibility of the UI based on the server health
+    Returns:
+        hide/show main ui/dialog
+    """
+    health = check_server_health(cloud_gateway_api=CLOUD_GATEWAY_API)
+    if health:
+        return gr.update(visible=True), gr.update(visible=False)    # Show main UI, hide dialog
+    else:
+        return gr.update(visible=False), gr.update(visible=True)    # Hide main UI, show dialog
+def generate(
+        message: str,
+        chat_history: list,
+        system_prompt: str,
+        max_new_tokens: int = 1024,
+        temperature: float = 0.6,
+        top_p: float = 0.9,
+        top_k: int = 50,
+        repetition_penalty: float = 1.2,
+) -> Iterator[str]:
+    """Send a request to backend, fetch the streaming responses and emit to the UI.
+    Args:
+        message (str): input message from the user
+        chat_history (list[tuple[str, str]]): entire chat history of the session
+        system_prompt (str): system prompt
+        max_new_tokens (int, optional): maximum number of tokens to generate, ignoring the number of tokens in the
+                                        prompt. Defaults to 1024.
+        temperature (float, optional): the value used to module the next token probabilities. Defaults to 0.6.
+        top_p (float, optional): if set to float<1, only the smallest set of most probable tokens with probabilities
+                                    that add up to top_p or higher are kept for generation. Defaults to 0.9.
+        top_k (int, optional): the number of highest probability vocabulary tokens to keep for top-k-filtering.
+                                Defaults to 50.
+        repetition_penalty (float, optional): the parameter for repetition penalty. 1.0 means no penalty.
+                                Defaults to 1.2.
+    Yields:
+        Iterator[str]: Streaming responses to the UI
+    """
+    # sample method to yield responses from the llm model
+    outputs = []
+    for text in request_generation(message=message,
+                                   system_prompt=system_prompt,
+                                   max_new_tokens=max_new_tokens,
+                                   temperature=temperature,
+                                   top_p=top_p,
+                                   top_k=top_k,
+                                   repetition_penalty=repetition_penalty,
+                                   cloud_gateway_api=CLOUD_GATEWAY_API):
+        outputs.append(text)
+        yield "".join(outputs)
+# Chat widget backed by generate(). The additional inputs are listed in the same order as
+# generate()'s parameters that follow chat_history (system_prompt, max_new_tokens,
+# temperature, top_p, top_k, repetition_penalty).
+chat_interface = gr.ChatInterface(
+    fn=generate,
+    additional_inputs=[
+        gr.Textbox(label="System prompt", lines=6),
+        gr.Slider(
+            label="Max New Tokens",
+            minimum=1,
+            maximum=MAX_NEW_TOKENS,
+            step=1,
+            value=1024,
+        ),
+        # NOTE(review): the slider starts at 0.1 while generate() defaults temperature to 0.6 --
+        # confirm which default is intended.
+        gr.Slider(
+            label="Temperature",
+            minimum=0.1,
+            maximum=4.0,
+            step=0.1,
+            value=0.1,
+        ),
+        # NOTE(review): the slider starts at 0.95 while generate() defaults top_p to 0.9.
+        gr.Slider(
+            label="Top-p (nucleus sampling)",
+            minimum=0.05,
+            maximum=1.0,
+            step=0.05,
+            value=0.95,
+        ),
+        gr.Slider(
+            label="Top-k",
+            minimum=1,
+            maximum=1000,
+            step=1,
+            value=50,
+        ),
+        gr.Slider(
+            label="Repetition penalty",
+            minimum=1.0,
+            maximum=2.0,
+            step=0.05,
+            value=1.2,
+        ),
+    ],
+    # presumably disables the stop button -- verify against the pinned gradio version
+    stop_btn=None,
+    # Sample prompts offered to the user; each entry supplies only the message text.
+    examples=[
+        ["Hello there! How are you doing?"],
+        ["Can you explain briefly to me what is the Python programming language?"],
+        ["Explain the plot of Cinderella in a sentence."],
+        ["How many hours does it take a man to eat a Helicopter?"],
+        ["Write a 100-word article on 'Benefits of Open-Source in AI research'."],
+    ],
+    cache_examples=False,
+    chatbot=gr.Chatbot(
+            height=600)
+)
+# Top-level layout: the chat UI and an error dialog, of which exactly one is visible,
+# plus a timer that keeps re-checking the backend.
+with gr.Blocks(css="style.css", theme=gr.themes.Default()) as demo:
+    # Get the server status before displaying UI (probed once, while the layout is built;
+    # it only decides the initial visibility of the two containers below)
+    visibility = check_server_health(CLOUD_GATEWAY_API)
+    # Container for the main interface
+    with gr.Column(visible=visibility, elem_id="main_ui") as main_ui:
+        gr.Markdown(f"""
+            # Llama-3 8B Chat
+            This Space is an Alpha release that demonstrates model [Llama-3-8b-chat](https://huggingface.co/meta-llama/Meta-Llama-3-8B) by Meta, a Llama 3 model with 8B parameters fine-tuned for chat instructions, running on AMD MI210 infrastructure. Feel free to play with it!
+            """)
+        chat_interface.render()
+    # Dialog box shown instead of the main UI when the health check fails
+    with gr.Row(visible=(not visibility), elem_id="dialog_box") as dialog_box:
+        # Add spinner and message
+        get_dialog_box()
+    # Timer to check server health every 10 seconds and update UI:
+    # each tick runs toggle_ui, whose two updates are applied to main_ui and dialog_box
+    timer = gr.Timer(value=10)
+    timer.tick(fn=toggle_ui, outputs=[main_ui, dialog_box])
+if __name__ == "__main__":
+    demo.queue(max_size=int(os.getenv("QUEUE"))).launch()

dialog.py ADDED Viewed

	@@ -0,0 +1,45 @@

+import gradio as gr
+def get_dialog_box():
+    return gr.HTML("""
+            <div style="display: flex; align-items: center; justify-content: center; min-height: 80vh;">
+            <div style="display: flex; flex-direction: column; align-items: center;">
+                <!-- Spinner -->
+                <div class="loader" style="margin-top: 20px;"></div>
+                <!-- Message -->
+                <h2 style="color: orange; font-family: trebuchet ms, sans-serif; align-items: center;">The service is not working, please refresh or try again later!</h2>
+            </div>
+            </div>
+            <!-- Spinner CSS -->
+            <style>
+                /* HTML: <div class="loader"></div> */
+                .loader {
+                  width: 120px;
+                  height: 22px;
+                  border-radius: 40px;
+                  color: orange !important;
+                  border: 2px solid;
+                  position: relative;
+                  overflow: hidden;
+                }
+                .loader::before {
+                  content: "";
+                  position: absolute;
+                  margin: 2px;
+                  width: 14px;
+                  top: 0;
+                  bottom: 0;
+                  left: -20px;
+                  border-radius: inherit;
+                  background: currentColor;
+                  box-shadow: -10px 0 12px 3px currentColor;
+                  clip-path: polygon(0 5%, 100% 0,100% 100%,0 95%,-30px 50%);
+                  animation: l14 1s infinite linear;
+                }
+                @keyframes l14 {
+                  100% {left: calc(100% + 20px)}
+                }
+            </style>
+            """)

gateway.py ADDED Viewed

	@@ -0,0 +1,90 @@

+import json
+import requests
+def check_server_health(cloud_gateway_api: str):
+    """
+    Use the appropriate API endpoint to check the server health.
+    Args:
+        cloud_gateway_api: API endpoint to probe.
+    Returns:
+        True if server is active, false otherwise.
+    """
+    try:
+        response = requests.get(cloud_gateway_api + "/health")
+        if response.status_code == 200:
+            return True
+    except requests.ConnectionError:
+        print("Failed to establish connection to the server.")
+    return False
+def request_generation(message: str,
+                       system_prompt: str,
+                       cloud_gateway_api: str,
+                       max_new_tokens: int = 1024,
+                       temperature: float = 0.6,
+                       top_p: float = 0.9,
+                       top_k: int = 50,
+                       repetition_penalty: float = 1.2, ):
+    """
+    Request streaming generation from the cloud gateway API. Uses the simple requests module with stream=True to utilize
+    token-by-token generation from LLM.
+    Args:
+        message: prompt from the user.
+        system_prompt: system prompt to append.
+        cloud_gateway_api (str): API endpoint to send the request.
+        max_new_tokens: maximum number of tokens to generate, ignoring the number of tokens in the prompt.
+        temperature: the value used to module the next token probabilities.
+        top_p: if set to float<1, only the smallest set of most probable tokens with probabilities that add up to top_p
+                or higher are kept for generation.
+        top_k: the number of highest probability vocabulary tokens to keep for top-k-filtering.
+        repetition_penalty: the parameter for repetition penalty. 1.0 means no penalty.
+    Returns:
+    """
+    payload = {
+        "model": "meta-llama/Meta-Llama-3-8B-Instruct",
+        "messages": [
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": message}
+        ],
+        "max_tokens": max_new_tokens,
+        "temperature": temperature,
+        "top_p": top_p,
+        "repetition_penalty": repetition_penalty,
+        "top_k": top_k,
+        "stream": True  # Enable streaming
+    }
+    with requests.post(cloud_gateway_api + "/v1/chat/completions", json=payload, stream=True) as response:
+        for chunk in response.iter_lines():
+            if chunk:
+                # Convert the chunk from bytes to a string and then parse it as json
+                chunk_str = chunk.decode('utf-8')
+                # Remove the `data: ` prefix from the chunk if it exists
+                if chunk_str.startswith("data: "):
+                    chunk_str = chunk_str[len("data: "):]
+                # Skip empty chunks
+                if chunk_str.strip() == "[DONE]":
+                    break
+                # Parse the chunk into a JSON object
+                try:
+                    chunk_json = json.loads(chunk_str)
+                    # Extract the "content" field from the choices
+                    content = chunk_json["choices"][0]["delta"].get("content", "")
+                    # Print the generated content as it's streamed
+                    if content:
+                        yield content
+                except json.JSONDecodeError:
+                    # Handle any potential errors in decoding
+                    continue

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+numpy==1.26.4
+pillow==10.4.0
+gradio==4.43.0
+fastapi==0.111.1
+websockets==11.0.3