import os

import gradio as gr
import whisper
from PIL import Image
from diffusers import StableDiffusionPipeline

# Hugging Face access token used to download the gated Stable Diffusion weights
MY_SECRET_TOKEN = os.environ.get('HF_TOKEN_SD')

# Speech-to-text model: Whisper "small" checkpoint
whisper_model = whisper.load_model("small")

# This demo runs on CPU only
device = "cpu"

# Stable Diffusion v1.4 pipeline; the token authenticates access to the weights
pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", use_auth_token=MY_SECRET_TOKEN)
pipe.to(device)
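
# Optional tweak (not part of the original app): on a machine with a CUDA GPU,
# the pipeline could be moved there instead for much faster generation, e.g.
#   import torch
#   if torch.cuda.is_available():
#       pipe = pipe.to("cuda")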


def get_transcribe(audio):
    # Load the recorded audio file and pad/trim it to Whisper's 30-second window
    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)

    # Compute the log-Mel spectrogram and move it to the model's device
    mel = whisper.log_mel_spectrogram(audio).to(whisper_model.device)

    # Detect the spoken language (the result is not used further here)
    _, probs = whisper_model.detect_language(mel)

    # Decode with task="translate" so the output text is always English
    options = whisper.DecodingOptions(task="translate", fp16=False)
    result = whisper.decode(whisper_model, mel, options)

    print(result)
    print(result.text)
    return result.text


def get_images(audio):
    # Transcribe/translate the audio into an English prompt
    prompt = get_transcribe(audio)

    # Generate two images for the prompt
    images_list = pipe([prompt] * 2)

    # Replace any NSFW-flagged output with a placeholder image
    images = []
    safe_image = Image.open(r"unsafe.png")
    for i, image in enumerate(images_list["sample"]):
        if images_list["nsfw_content_detected"][i]:
            images.append(safe_image)
        else:
            images.append(image)

    return prompt, images
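
# Note: the dict-style access above ("sample" / "nsfw_content_detected") follows the
# early diffusers releases this demo was written against. With a recent diffusers
# version (an assumption, depending on what is installed), the same information is
# exposed as attributes of the pipeline output, roughly:
#   out = pipe([prompt] * 2)
#   images, flags = out.images, out.nsfw_content_detected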


# Gradio UI components
audio = gr.Audio(label="Audio description of an image", show_label=True, source="microphone", type="filepath")
translated_prompt = gr.Textbox(label="Translated audio", lines=6)
gallery = gr.Gallery(label="Generated images", show_label=False, elem_id="gallery").style(grid=[1], height="auto")
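
# Note: these constructors use the Gradio 3.x signatures (source=..., .style(...)).
# If the app were ported to Gradio 4.x (an assumption, not part of the original),
# the microphone input would be declared with sources=["microphone"] and the gallery
# layout options would move into the Gallery constructor.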

title = "Whisper to Stable Diffusion"

description = """
<p style='text-align: center;'>
This demo is running on CPU. Offered by Sylvain <a href='https://twitter.com/fffiloni' target='_blank'>@fffiloni</a> • <img id='visitor-badge' alt='visitor badge' src='https://visitor-badge.glitch.me/badge?page_id=gradio-blocks.whisper-to-stable-diffusion' style='display: inline-block' /><br />
Record an audio description of an image, stop recording, then hit the Submit button to get 2 images from Stable Diffusion.<br />
Your audio will be translated to English with OpenAI's Whisper, then sent as a prompt to Stable Diffusion.
Try it in French! ;)
</p>
"""

article = """
<p style='text-align: center;'>
Whisper is a general-purpose speech recognition model. It is trained on a large dataset of diverse audio and is also a multi-task model that can perform multilingual speech recognition as well as speech translation and language identification.<br />
Model by <a href="https://github.com/openai/whisper" style="text-decoration: underline;" target="_blank">OpenAI</a>
</p>
"""

gr.Interface(fn=get_images,
             inputs=audio,
             outputs=[translated_prompt, gallery],
             title=title,
             description=description,
             article=article).queue(max_size=1000).launch()