import os

import gradio as gr
import whisper
from PIL import Image
from diffusers import StableDiffusionPipeline

# Hugging Face access token used to download the gated Stable Diffusion weights
MY_SECRET_TOKEN = os.environ.get('HF_TOKEN_SD')

# Speech-to-text model: Whisper "small" checkpoint
whisper_model = whisper.load_model("small")

# This demo runs on CPU only
device = "cpu"

# Stable Diffusion v1.4 pipeline; the token authenticates access to the weights
pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", use_auth_token=MY_SECRET_TOKEN)
pipe.to(device)
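
# Optional tweak (not part of the original app): on a machine with a CUDA GPU,
# the pipeline could be moved there instead for much faster generation, e.g.
#   import torch
#   if torch.cuda.is_available():
#       pipe = pipe.to("cuda")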


def get_transcribe(audio):
    # Load the recorded audio file and pad/trim it to Whisper's 30-second window
    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)

    # Compute the log-Mel spectrogram and move it to the model's device
    mel = whisper.log_mel_spectrogram(audio).to(whisper_model.device)

    # Detect the spoken language (the result is not used further here)
    _, probs = whisper_model.detect_language(mel)

    # Decode with task="translate" so the output text is always English
    options = whisper.DecodingOptions(task="translate", fp16=False)
    result = whisper.decode(whisper_model, mel, options)

    print(result)
    print(result.text)
    return result.text


def get_images(audio):
    # Transcribe/translate the audio into an English prompt
    prompt = get_transcribe(audio)

    # Generate two images for the prompt
    images_list = pipe([prompt] * 2)

    # Replace any NSFW-flagged output with a placeholder image
    images = []
    safe_image = Image.open(r"unsafe.png")
    for i, image in enumerate(images_list["sample"]):
        if images_list["nsfw_content_detected"][i]:
            images.append(safe_image)
        else:
            images.append(image)

    return prompt, images
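
# Note: the dict-style access above ("sample" / "nsfw_content_detected") follows the
# early diffusers releases this demo was written against. With a recent diffusers
# version (an assumption, depending on what is installed), the same information is
# exposed as attributes of the pipeline output, roughly:
#   out = pipe([prompt] * 2)
#   images, flags = out.images, out.nsfw_content_detected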


# Gradio UI components
audio = gr.Audio(label="Audio description of an image", show_label=True, source="microphone", type="filepath")
translated_prompt = gr.Textbox(label="Translated audio", lines=6)
gallery = gr.Gallery(label="Generated images", show_label=False, elem_id="gallery").style(grid=[1], height="auto")
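
# Note: these constructors use the Gradio 3.x signatures (source=..., .style(...)).
# If the app were ported to Gradio 4.x (an assumption, not part of the original),
# the microphone input would be declared with sources=["microphone"] and the gallery
# layout options would move into the Gallery constructor.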

title = "Whisper to Stable Diffusion"

description = """
<p style='text-align: center;'>
This demo is running on CPU. Offered by Sylvain <a href='https://twitter.com/fffiloni' target='_blank'>@fffiloni</a> • <img id='visitor-badge' alt='visitor badge' src='https://visitor-badge.glitch.me/badge?page_id=gradio-blocks.whisper-to-stable-diffusion' style='display: inline-block' /><br />
Record an audio description of an image, stop recording, then hit the Submit button to get 2 images from Stable Diffusion.<br />
Your audio will be translated to English with OpenAI's Whisper, then sent as a prompt to Stable Diffusion.
Try it in French! ;)
</p>
"""

article = """
<p style='text-align: center;'>
Whisper is a general-purpose speech recognition model. It is trained on a large dataset of diverse audio and is also a multi-task model that can perform multilingual speech recognition as well as speech translation and language identification.<br />
Model by <a href="https://github.com/openai/whisper" style="text-decoration: underline;" target="_blank">OpenAI</a>
</p>
"""

gr.Interface(fn=get_images,
             inputs=audio,
             outputs=[translated_prompt, gallery],
             title=title,
             description=description,
             article=article).queue(max_size=1000).launch()