import os

import gradio as gr
from openai import OpenAI

# Map each model name offered in the UI to its inference endpoint URL.
MODEL_URL_MAP = {
    "EuroLLM-9B-Instruct": os.getenv("ENDPOINT_URL_9B"),
}


def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    model,
):
    # The endpoint speaks the OpenAI-compatible chat API, so the OpenAI
    # client can target it directly via base_url.
    client = OpenAI(
        base_url=MODEL_URL_MAP.get(model),
        api_key=os.getenv("AUTH_TOKEN"),
    )

    # Rebuild the conversation: system prompt first, then the alternating
    # user/assistant turns from the Gradio history, then the new message.
    messages = [{"role": "system", "content": system_message}]
    for user_turn, assistant_turn in history:
        if user_turn:
            messages.append({"role": "user", "content": user_turn})
        if assistant_turn:
            messages.append({"role": "assistant", "content": assistant_turn})
    messages.append({"role": "user", "content": message})

    response = ""
    try:
        chat_completion = client.chat.completions.create(
            model=model,
            messages=messages,
            temperature=temperature,
            max_tokens=max_tokens,
            stream=True,
        )
        # Stream tokens back to the UI as they arrive. The final chunk can
        # carry a None delta, so guard before concatenating.
        for chunk in chat_completion:
            token = chunk.choices[0].delta.content
            if token:
                response += token
            yield response
    except Exception:
        # On an endpoint error, surface whatever was generated so far
        # instead of crashing the chat.
        yield response


demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="", label="System Prompt"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.0, maximum=4.0, value=0.0, step=0.1, label="Temperature"),
        gr.Dropdown(
            ["EuroLLM-9B-Instruct"],
            label="Model Name",
            value="EuroLLM-9B-Instruct",
        ),
    ],
)

if __name__ == "__main__":
    demo.launch()
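
# Usage sketch, with assumptions flagged: ENDPOINT_URL_9B should point at the
# model's OpenAI-compatible route (for a Hugging Face Inference Endpoint this
# is typically the endpoint URL with a "/v1/" suffix, but the exact shape
# depends on your deployment), and AUTH_TOKEN must hold a token the endpoint
# accepts. The filename "app.py" below is hypothetical; adjust to wherever
# this script lives.
#
#   export ENDPOINT_URL_9B="https://<your-endpoint>.endpoints.huggingface.cloud/v1/"
#   export AUTH_TOKEN="hf_..."
#   python app.py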