# whisper-it / app.py
# GIanlucaRub's picture
# Update app.py
# c731f97
import os
import tempfile

import gradio as gr
from moviepy.editor import VideoFileClip
from pytube import YouTube
from transformers import pipeline
# Build the speech-recognition pipeline once at module level; the transcription
# handlers below all call this same `pipe` object.
pipe = pipeline(
    task="automatic-speech-recognition",
    model="GIanlucaRub/whisper-small-it-3",
)
def transcribe_yt(link):
    """Transcribe the audio track of a YouTube video.

    Args:
        link: URL of the YouTube video. An empty or whitespace-only value
            yields an empty transcription instead of an error.

    Returns:
        The text produced by the module-level ``pipe`` for the first
        audio-only stream of the video, or ``""`` when no link was given.

    Raises:
        IndexError: If the video exposes no audio-only stream.
    """
    # The handler can be triggered with an empty textbox; there is nothing to
    # download in that case.
    if not link or not link.strip():
        return ""

    yt = YouTube(link)
    stream = yt.streams.filter(only_audio=True)[0]

    # Download into a per-request temporary directory: a fixed "audio.mp3" in
    # the working directory would be shared (and clobbered) by concurrent
    # requests, and would be left behind if transcription raised. The context
    # manager removes the directory on both the success and the error path.
    with tempfile.TemporaryDirectory() as workdir:
        audio = stream.download(output_path=workdir, filename="audio.mp3")
        return pipe(audio)["text"]
def transcribe_audio(audio):
    """Transcribe an audio file with the module-level ``pipe``.

    Args:
        audio: Path to the audio file. ``None`` or an empty string (what the
            handler receives when nothing was recorded or uploaded) yields an
            empty transcription instead of an error.

    Returns:
        The ``"text"`` entry of the pipeline output, or ``""`` when no audio
        was supplied.
    """
    # Guard the "button clicked without any audio" case so the pipeline is
    # never called with a missing input.
    if not audio:
        return ""
    return pipe(audio)["text"]
def populate_metadata(link):
    """Look up the thumbnail URL and title for a YouTube link.

    Args:
        link: URL of the YouTube video.

    Returns:
        A ``(thumbnail_url, title)`` tuple, in that order.
    """
    video = YouTube(link)
    thumbnail = video.thumbnail_url
    video_title = video.title
    return thumbnail, video_title
def transcribe_video(video):
    """Extract the audio track of a video file and transcribe it.

    The audio is written next to the video as ``<video stem>.mp3``. The video
    file is deleted once its audio has been extracted, and the extracted audio
    file is deleted before returning, whether or not transcription succeeded.

    Args:
        video: Path to the video file. ``None`` or an empty string yields an
            empty transcription instead of an error.

    Returns:
        The transcription returned by ``transcribe_audio``, or ``""`` when no
        video was supplied.

    Raises:
        ValueError: If the video has no audio track.
    """
    if not video:
        return ""

    # splitext copes with extensions of any length (".webm", ".mov", none);
    # slicing off the last four characters only works for three-letter ones.
    audio = os.path.splitext(video)[0] + ".mp3"
    try:
        clip = VideoFileClip(video)
        try:
            if clip.audio is None:
                raise ValueError(
                    "The uploaded video has no audio track to transcribe."
                )
            clip.audio.write_audiofile(audio)
        finally:
            # Always release the clip, even when extraction fails.
            clip.close()
        # The video is no longer needed once its audio has been extracted.
        os.remove(video)
        return transcribe_audio(audio)
    finally:
        # Never leave a (possibly partial) extracted audio file behind.
        if os.path.exists(audio):
            os.remove(audio)
# Build the Gradio UI: one shared "Transcription" textbox that is filled by
# four input paths (microphone, uploaded audio, uploaded video, YouTube link).
# NOTE(review): this copy of the file has lost its indentation, so the nesting
# of the `with` contexts below (Blocks / Group / Box / Row) cannot be read off
# the source — confirm it against the running app before changing the layout.
block = gr.Blocks()
with block:
# Page header: title and a short description of the demo.
gr.HTML(
"""
<div style="text-align: center; max-width: 500px; margin: 0 auto;margin-top: 10px">
<div>
<h1 style="font-size: 400%;line-height: 1.2;">Whisper Italian Automatic Speech Recognition</h1>
</div>
<p style="margin-bottom: 10px; font-size: 150%;margin-top: 30px;line-height: 1.2;">
Realtime demo for Italian speech recognition using a fine-tuned Whisper Small model.You can use the model in 4 different ways.
</p>
</div>
"""
)
with gr.Group():
with gr.Box():
# Output area: caption plus the textbox every handler writes into.
gr.HTML(
"""
<div style="text-align: center; max-width: 500px; margin: 0 auto;margin-top: 10px">
<p style="margin-bottom: 10px; font-size: 100%;margin-top: 10px;line-height: 1.2;">
Here you can see the transcription.
</p>
</div>
""")
text = gr.Textbox(
label="Transcription",
placeholder="Transcription Output",
lines=5)
# Input 1: microphone recording, handed to the handler as a file path.
gr.HTML(
"""
<div style="text-align: center; max-width: 500px; margin: 0 auto;margin-top: 10px">
<p style="margin-bottom: 10px; font-size: 100%;margin-top: 20px;line-height: 1.0;">
You can record audio from your microphone.
</p>
</div>
""")
microphone=gr.Audio(source="microphone", type="filepath")
with gr.Row().style(mobile_collapse=False, equal_height=True):
btn_microphone = gr.Button("Transcribe microphone audio")
# Input 2: uploaded audio file, handed to the handler as a file path.
gr.HTML(
"""
<div style="text-align: center; max-width: 500px; margin: 0 auto;margin-top: 10px">
<p style="margin-bottom: 10px; font-size: 100%;margin-top: 20px;line-height: 1.2;">
You can upload an audio file.
</p>
</div>
""")
audio_uploaded=gr.Audio(source="upload", type="filepath")
with gr.Row().style(mobile_collapse=False, equal_height=True):
btn_audio_uploaded = gr.Button("Transcribe audio uploaded")
# Input 3: uploaded video file.
gr.HTML(
"""
<div style="text-align: center; max-width: 500px; margin: 0 auto;margin-top: 10px">
<p style="margin-bottom: 10px; font-size: 100%;margin-top: 20px;line-height: 1.2;">
You can upload a video file
</p>
</div>
""")
video_uploaded = gr.Video(source = "upload")
with gr.Row().style(mobile_collapse=False, equal_height=True):
btn_video_uploaded = gr.Button("Transcribe video uploaded")
# Input 4: YouTube link, plus widgets showing the video's title and thumbnail.
gr.HTML(
"""
<div style="text-align: center; max-width: 500px; margin: 0 auto;margin-top: 10px">
<p style="margin-bottom: 10px; font-size: 100%;margin-top: 20px;line-height: 1.2;">
You can put a youtube video link
</p>
</div>
""")
link = gr.Textbox(label="YouTube Link")
with gr.Row().style(mobile_collapse=False, equal_height=True):
btn_youtube = gr.Button("Transcribe Youtube video")
with gr.Row().style(mobile_collapse=False, equal_height=True):
# NOTE(review): `placeholder` may not be a supported gr.Label argument — confirm.
title = gr.Label(label="Video Title", placeholder="Title")
img = gr.Image(label="Thumbnail")
# Events
# Each button runs its handler and writes the result into the shared `text` box.
btn_youtube.click(transcribe_yt, inputs=[link], outputs=[text])
btn_microphone.click(transcribe_audio, inputs=[microphone], outputs=[text])
btn_audio_uploaded.click(transcribe_audio, inputs=[audio_uploaded], outputs=[text])
btn_video_uploaded.click(transcribe_video, inputs=[video_uploaded], outputs=[text])
# The link textbox's change event refreshes the thumbnail and title widgets;
# populate_metadata returns (thumbnail_url, title), matching [img, title].
link.change(populate_metadata, inputs=[link], outputs=[img, title])
# Start the app with debug mode enabled.
block.launch(debug=True)