Spaces:

helenai
/

openvino_transformers_streaming

Sleeping

App Files Files Community

openvino_transformers_streaming / app.py

helenai

Switch to Mistral model

72324f9 9 months ago

raw

history blame

4.16 kB

	import pprint
	import subprocess
	from threading import Thread

	import gradio as gr
	from optimum.intel.openvino import OVModelForCausalLM
	from transformers import AutoTokenizer, TextIteratorStreamer

	# Log the host CPU details at startup. `lscpu` does not exist on every
	# platform, so a missing binary must not stop the app from starting.
	try:
	    result = subprocess.run(["lscpu"], text=True, capture_output=True)
	except FileNotFoundError:
	    # Keep the module-level name bound even when no CPU info is available.
	    result = None
	    print("lscpu not found; skipping CPU information dump")
	else:
	    pprint.pprint(result.stdout)

	# Hub id of the original (non-converted) model; used for the link in the UI text.
	original_model_id = "mistralai/Mistral-7B-Instruct-v0.2"
	# Hub id of the OpenVINO conversion that is actually loaded below.
	model_id = "helenai/mistralai-Mistral-7B-Instruct-v0.2-ov"

	# Tokenizer and model are both loaded from the converted repository.
	tokenizer = AutoTokenizer.from_pretrained(model_id)
	model = OVModelForCausalLM.from_pretrained(model_id)


	def run_generation(user_text, top_p, temperature, top_k, max_new_tokens):
	    """Stream a sampled completion for a single user message.

	    ``model.generate`` runs on a worker thread while this generator pulls
	    decoded text from the streamer and yields the accumulated output after
	    every new chunk.

	    Args:
	        user_text: Content of the single ``"user"`` chat message.
	        top_p: Forwarded to ``model.generate`` as ``top_p``.
	        temperature: Cast to ``float`` and forwarded as ``temperature``.
	        top_k: Cast to ``int`` and forwarded as ``top_k``.
	        max_new_tokens: Cast to ``int`` and forwarded as ``max_new_tokens``.

	    Yields:
	        The full text generated so far, growing with each streamed chunk.
	    """
	    # A longer chat history (alternating user/assistant messages) could be
	    # passed here instead of a single user turn.
	    message = [{"role": "user", "content": user_text}]

	    model_inputs = tokenizer.apply_chat_template(message, return_tensors="pt", return_dict=True)

	    # Start generation on a separate thread, so that we don't block the UI. The text is pulled from the streamer
	    # in the main thread. Adds timeout to the streamer to handle exceptions in the generation thread.
	    streamer = TextIteratorStreamer(
	        tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True
	    )
	    generate_kwargs = dict(
	        model_inputs,
	        streamer=streamer,
	        # Count-like values are coerced explicitly, mirroring the float()
	        # cast on temperature, so a value such as 50.0 is still accepted.
	        max_new_tokens=int(max_new_tokens),
	        do_sample=True,
	        top_p=top_p,
	        temperature=float(temperature),
	        top_k=int(top_k),
	    )
	    worker = Thread(target=model.generate, kwargs=generate_kwargs)
	    worker.start()

	    # Pull the generated text from the streamer, and update the model output.
	    model_output = ""
	    for new_text in streamer:
	        model_output += new_text
	        yield model_output

	    # The stream is exhausted, so generation is finishing; wait for the
	    # worker so no thread is left dangling after the request completes.
	    worker.join()
	    # Exposed as StopIteration.value to callers that drive the generator manually.
	    return model_output


	def reset_textbox():
	    """Return a ``gr.update`` payload whose value is the empty string."""
	    cleared = gr.update(value="")
	    return cleared


	# Build the demo UI: an intro blurb, a prompt/output column, a column of
	# sampling controls, and two triggers (Enter key and button) for generation.
	with gr.Blocks() as demo:
	    original_link = "https://huggingface.co/spaces/joaogante/transformers_streaming"
	    intro_markdown = (
	        "# OpenVINO and 🤗 Transformers 🔥Streaming🔥 on Gradio\n"
	        "This demo showcases the use of the "
	        "[streaming feature](https://huggingface.co/docs/transformers/main/en/generation_strategies#streaming) "
	        "of 🤗 Transformers with OpenVINO models and Gradio to generate text in real-time. It uses "
	        f"[{original_model_id}](https://huggingface.co/{original_model_id}), "
	        "converted to OpenVINO.\n\n"
	        f"This space was duplicated from {original_link} and modified for OpenVINO models."
	    )
	    gr.Markdown(intro_markdown)

	    with gr.Row():
	        # Wide column: prompt, streamed output and the submit button.
	        with gr.Column(scale=4):
	            user_text = gr.Textbox(label="User input")
	            model_output = gr.Textbox(label="Model output", lines=10, interactive=False)
	            button_submit = gr.Button(value="Submit")

	        # Narrow column: sampling parameters handed to run_generation.
	        with gr.Column(scale=1):
	            max_new_tokens = gr.Slider(
	                label="Max New Tokens", minimum=1, maximum=1000, value=250, step=1, interactive=True
	            )
	            top_p = gr.Slider(
	                label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, value=0.95, step=0.05, interactive=True
	            )
	            top_k = gr.Slider(
	                label="Top-k", minimum=1, maximum=50, value=50, step=1, interactive=True
	            )
	            temperature = gr.Slider(
	                label="Temperature", minimum=0.1, maximum=5.0, value=0.8, step=0.1, interactive=True
	            )

	    # Same inputs, in run_generation's parameter order, for both triggers.
	    generation_inputs = [user_text, top_p, temperature, top_k, max_new_tokens]
	    for register in (user_text.submit, button_submit.click):
	        register(run_generation, generation_inputs, model_output)

	# Queuing is enabled by `.queue()`. The `enable_queue` argument of `launch()`
	# is deprecated in Gradio 3.x and rejected by Gradio 4+, so it is not passed.
	demo.queue(max_size=32).launch(server_name="0.0.0.0")
	# For local use:
	# demo.launch(server_name="0.0.0.0")