from huggingface_hub import InferenceClient
import gradio as gr

inference_client = InferenceClient("google/gemma-7b-it")


# Format the prompt as per the chat template on the official model page:
# https://huggingface.co/google/gemma-7b-it
def format_prompt(input_text, history):
    prompt = ""
    if history:
        for previous_prompt, response in history:
            prompt += (
                f"<start_of_turn>user\n{previous_prompt}<end_of_turn>\n"
                f"<start_of_turn>model\n{response}<end_of_turn>\n"
            )
    prompt += f"<start_of_turn>user\n{input_text}<end_of_turn>\n<start_of_turn>model\n"
    return prompt


def generate(prompt, history):
    if not history:
        history = []

    kwargs = dict(
        temperature=1.0,
        max_new_tokens=512,
        top_p=0.9,
        repetition_penalty=1.0,
        do_sample=True,
    )

    formatted_prompt = format_prompt(prompt, history)

    # Stream tokens as they are generated; return_full_text=False so the
    # prompt is not echoed back at the start of the response.
    response = inference_client.text_generation(
        formatted_prompt,
        **kwargs,
        stream=True,
        details=True,
        return_full_text=False,
    )

    # Accumulate the streamed tokens and yield the growing response so the
    # chatbot updates incrementally.
    output = ""
    for chunk in response:
        output += chunk.token.text
        yield output


chatbot = gr.Chatbot(height=500)

with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.HTML("<center><h1>Google Gemma 7B IT</h1></center>")
    gr.ChatInterface(
        generate,
        chatbot=chatbot,
        retry_btn=None,
        undo_btn=None,
        clear_btn="Clear",
        description="This chatbot uses the Hugging Face Inference Client with the google/gemma-7b-it model.",
        examples=[["Explain artificial intelligence in a few lines."]],
    )

demo.queue().launch()