"""Gradio web front-end that serves a llama.cpp model through a simple form.

At import time the script loads the model from ``./model.bin``, reads the
prompt template (``system.prompt``) and the default system message
(``system.message``), builds a Gradio interface around ``generate_answer``
and launches it.
"""

# Importing libraries
import re
from time import time
from typing import Optional, Tuple

import gradio as gr
import psutil
from llama_cpp import Llama

# Initing things
print("! INITING LLAMA MODEL !")
llm = Llama(model_path="./model.bin")  # LLaMa model
# This is just for indication in "three dots menu"; it is not referenced
# anywhere else in this script.
llama_model_name = "Vikhrmodels/Vikhr-Qwen-2.5-1.5B-Instruct-GGUF"
print("! INITING DONE !")

# Generation limits. Requests outside [_MIN_TOKENS, _MAX_TOKENS] silently
# fall back to _FALLBACK_TOKENS instead of being rejected.
_MIN_TOKENS = 16
_MAX_TOKENS = 256
_FALLBACK_TOKENS = 64
# How many times generation is retried when the model returns (almost) nothing.
_MAX_ATTEMPTS = 3
_STOP_SEQUENCES = ("<|im_end|>", "<|end_of_turn|>")
# Placeholders recognised inside the prompt template.
_PLACEHOLDER_RE = re.compile(r"\{(prompt|system_message)\}")

# Preparing things to work
title = "llama.cpp API"
# NOTE: the memory figure is evaluated once, when this module is loaded; it is
# not refreshed per request.
desc = '''

Hello, world!

This is showcase how to make own server with any Llama based model using llama_cpp.
I'm using here 1.5b model just for example. Also here's only CPU power.
But you can use GPU power as well!

How to GPU?

Change `CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS` in Dockerfile on `CMAKE_ARGS="-DLLAMA_CUBLAS=on"`. Also you can try `DLLAMA_CLBLAST` or `DLLAMA_METAL`.

How to test it on own machine?

You can install Docker, build image and run it. I made `run-docker.sh` for ya. To stop container run `docker ps`, find name of container and run `docker stop _dockerContainerName_`
Or you can once follow steps in Dockerfile and try it on your machine, not in Docker.

''' + f"Memory used: {psutil.virtual_memory().percent}\n" + ''' Powered by llama-cpp-python and Gradio.

'''

# Loading prompt
with open('system.prompt', 'r', encoding='utf-8') as f:
    prompt = f.read()
with open('system.message', 'r', encoding='utf-8') as f:
    system_message = f.read()


def _has_content(text) -> bool:
    """Return True when *text* is a string with more than one non-blank character."""
    return isinstance(text, str) and len(text.strip()) > 1


def _render_prompt(template: str, request: str, system_text: str) -> str:
    """Fill ``{prompt}`` and ``{system_message}`` in *template* in a single pass.

    A single pass means substituted text is never re-scanned, so a user
    request that happens to contain ``{system_message}`` stays literal.
    """
    values = {"prompt": request, "system_message": system_text}
    return _PLACEHOLDER_RE.sub(lambda match: values[match.group(1)], template)


def generate_answer(
    request: str,
    max_tokens: int = 256,
    custom_prompt: Optional[str] = None,
) -> Tuple[str, str]:
    """Generate a model answer for *request*.

    Args:
        request: User text substituted for ``{prompt}`` in the template.
        max_tokens: Requested completion length. Values outside 16..256 are
            replaced by 64; in-range floats are truncated to int.
        custom_prompt: Optional replacement for the default system message.
            Ignored unless it has more than one non-blank character.

    Returns:
        ``(answer, logs)``. ``answer`` is the generated text or a
        human-readable error message; ``logs`` is a plain-text trace of the
        request. The function does not raise for bad input or model errors.
    """
    t0 = time()
    logs = f"Request: {request}\nMax tokens: {max_tokens}\nCustom prompt: {custom_prompt}\n"

    # Explicit validation instead of a catch-all: the UI can deliver None for
    # an empty field, and only those type problems should yield this message.
    if not isinstance(request, str) or not isinstance(max_tokens, (int, float)):
        return "Not enough data! Check that you passed all needed data.", logs

    # Range-check first (NaN/inf compare False and take the fallback), then
    # convert: the Number widget may hand over a float.
    if _MIN_TOKENS <= max_tokens <= _MAX_TOKENS:
        token_limit = int(max_tokens)
    else:
        token_limit = _FALLBACK_TOKENS

    system_text = custom_prompt if _has_content(custom_prompt) else system_message
    user_prompt = _render_prompt(prompt, request, system_text)
    logs += f"\nFinal prompt: {user_prompt}\n"

    try:
        # Workaround: the model sometimes returns an empty completion for
        # reasons not yet understood, so retry a few times before giving up.
        text = ""
        attempt = 0
        for attempt in range(1, _MAX_ATTEMPTS + 1):
            logs += f"Attempt {attempt} to generate answer...\n"
            output = llm(
                user_prompt,
                max_tokens=token_limit,
                stop=list(_STOP_SEQUENCES),
                echo=False,
            )
            text = output["choices"][0]["text"]
            if _has_content(text):
                break
        logs += f"Final attempt: {attempt}\n"

        if not _has_content(text):
            logs += f"Generated and aborted: {text}"
            text = "Sorry, but something went wrong while generating answer. Try again or fix code. If you are maintainer of this space, look into logs."

        logs += f"\nFinal: '''{text}'''"
        logs += f"\n\nTime spent: {time()-t0}"
        return text, logs
    except Exception as e:
        # Request boundary: report the failure to the caller through the logs
        # rather than letting it propagate into the UI.
        logs += f"{type(e).__name__}: {e}"
        logs += f"\n\nTime spent: {time()-t0}"
        return "Oops! Internal server error. Check the logs of space/instance.", logs


print("! LOAD GRADIO INTERFACE !")
demo = gr.Interface(
    fn=generate_answer,
    inputs=[
        gr.components.Textbox(label="Input"),
        gr.components.Number(value=256),
        gr.components.Textbox(label="Custom system prompt"),
    ],
    outputs=[
        gr.components.Textbox(label="Output"),
        gr.components.Textbox(label="Logs"),
    ],
    title=title,
    description=desc,
    allow_flagging='never',
)
demo.queue()
print("! LAUNCHING GRADIO !")
demo.launch(server_name="0.0.0.0")