import os
import requests
import subprocess
import json
import time
import spaces
import gradio as gr
from typing import List, Optional, Tuple, Dict

DEFAULT_SYSTEM = "You are a helpful assistant."
HF_MODEL_ID = "bartowski/Llama-3.2-1B-Instruct-GGUF"
HF_FILENAME = "Llama-3.2-1B-Instruct-Q4_K_M.gguf"

###############################################

API_PATH_HEALTH = "/health"
API_PATH_COMPLETIONS = "/chat/completions"
LLAMA_CPP_SERVER_BASE = "http://127.0.0.1:8080"
LLAMA_CPP_SERVER_START_TIMEOUT = 50  # seconds

# download the GGUF model file if it is not already present
if not os.path.exists("model.gguf"):
    url = f"https://huggingface.co/{HF_MODEL_ID}/resolve/main/{HF_FILENAME}"
    subprocess.check_call(["curl", "-o", "model.gguf", "-L", url])

# download a prebuilt llama-server binary
if not os.path.exists("llama-server"):
    # FIXME: currently, we can't build inside the gradio container because nvcc is missing
    subprocess.check_call("curl -o llama-server -L https://ngxson-llamacpp-builder.hf.space/llama-server", shell=True)
    subprocess.check_call("chmod +x llama-server", shell=True)

###############################################

class Role:
    SYSTEM = "system"
    USER = "user"
    ASSISTANT = "assistant"

History = List[Tuple[str, str]]
Messages = List[Dict[str, str]]

def clear_session() -> Tuple[str, History]:
    return "", []

def modify_system_session(system: str) -> Tuple[str, str, History]:
    if system is None or len(system) == 0:
        system = DEFAULT_SYSTEM
    return system, system, []

def history_to_messages(history: History, system: str) -> Messages:
    messages = [{"role": Role.SYSTEM, "content": system}]
    for h in history:
        messages.append({"role": Role.USER, "content": h[0]})
        messages.append({"role": Role.ASSISTANT, "content": h[1]})
    return messages

def messages_to_history(messages: Messages) -> Tuple[str, History]:
    assert messages[0]["role"] == Role.SYSTEM
    system = messages[0]["content"]
    history = []
    for q, r in zip(messages[1::2], messages[2::2]):
        history.append([q["content"], r["content"]])
    return system, history

def wait_until_llamacpp_ready():
    # poll the /health endpoint until the server answers with HTTP 200
    time.sleep(5)
    gr.Info("starting llama.cpp server...")
    trials = 0
    while True:
        try:
            response = requests.get(LLAMA_CPP_SERVER_BASE + API_PATH_HEALTH)
            if response.status_code == 200:
                print("Status 200 received. Exiting loop.")
                break
            else:
                print(f"Received status {response.status_code}. Retrying...")
Retrying...") except requests.exceptions.RequestException as e: print(f"Request failed: {e}") trials += 1 if trials > LLAMA_CPP_SERVER_START_TIMEOUT: raise TimeoutError("llama.cpp server did not start in time") time.sleep(1) # Wait for 1 second before retrying gr.Info("llama.cpp server is ready.") print("llama.cpp server is ready.") @spaces.GPU def model_chat(query: Optional[str], history: Optional[History], system: str ) -> Tuple[str, str, History]: if query is None: query = "" if history is None: history = [] # start llama.cpp server proc = subprocess.Popen(["./llama-server"], env=dict( os.environ, LLAMA_HOST="0.0.0.0", LLAMA_PORT="8080", LLAMA_ARG_CTX_SIZE=str(1024 * 32), LLAMA_ARG_FLASH_ATTN="1", LLAMA_ARG_MODEL="model.gguf", LLAMA_ARG_N_PARALLEL="1", LLAMA_ARG_N_GPU_LAYERS="9999", LLAMA_ARG_NO_MMAP="1", )) exception = None try: wait_until_llamacpp_ready() messages = history_to_messages(history, system) messages.append({"role": Role.USER, "content": query}) # adapted from https://gist.github.com/ggorlen/7c944d73e27980544e29aa6de1f2ac54 url = LLAMA_CPP_SERVER_BASE + API_PATH_COMPLETIONS headers = { # "Authorization": f"Bearer {api_key}", "Content-Type": "application/json" } data = { "temperature": 0.8, "messages": messages, "stream": True } response = requests.post(url, headers=headers, json=data, stream=True) response.raise_for_status() curr_text = "" for line in response.iter_lines(): line = line.decode("utf-8") if line.startswith("data: ") and not line.endswith("[DONE]"): data = json.loads(line[len("data: "):]) chunk = data["choices"][0]["delta"].get("content", "") # print(chunk, end="", flush=True) curr_text += chunk system, history = messages_to_history(messages + [{"role": Role.ASSISTANT, "content": curr_text}]) yield "", history, system except Exception as e: print(e) exception = e finally: # clean up proc.kill() if exception is not None: # re-raise the exception if needed raise exception with gr.Blocks() as demo: gr.Markdown(f"""
# {HF_MODEL_ID}
""") with gr.Row(): with gr.Column(scale=3): system_input = gr.Textbox(value=DEFAULT_SYSTEM, lines=1, label="System") with gr.Column(scale=1): modify_system = gr.Button("๐Ÿ› ๏ธ Set system prompt and clear history", scale=2) system_state = gr.Textbox(value=DEFAULT_SYSTEM, visible=False) chatbot = gr.Chatbot(label=HF_MODEL_ID) textbox = gr.Textbox(lines=2, label="Input") with gr.Row(): clear_history = gr.Button("๐Ÿงน Clear history") sumbit = gr.Button("๐Ÿš€ Send") sumbit.click(model_chat, inputs=[textbox, chatbot, system_state], outputs=[textbox, chatbot, system_input], concurrency_limit = 5) clear_history.click(fn=clear_session, inputs=[], outputs=[textbox, chatbot]) modify_system.click(fn=modify_system_session, inputs=[system_input], outputs=[system_state, system_input, chatbot]) demo.queue(api_open=False) demo.launch(max_threads=5)