Spaces:
Paused
Paused
File size: 2,487 Bytes
bfb5ccb 1c4ba6c 8ca2e83 1c4ba6c 0863f8c 8ca2e83 0863f8c 1c4ba6c bfb5ccb 8ca2e83 1c4ba6c 0863f8c 8ca2e83 eaed2c2 1c4ba6c eaed2c2 550d732 1c4ba6c 550d732 eaed2c2 1c4ba6c 29c16a4 1c4ba6c eaed2c2 6fd478d eaed2c2 1c4ba6c 0863f8c eaed2c2 1c4ba6c eaed2c2 1c4ba6c 8ca2e83 0863f8c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 |
import spaces
import os
import gradio as gr
import torch
import torchaudio
from transformers import AutoModelForCTC, Wav2Vec2BertProcessor
from pytube import YouTube
from transformers import pipeline
# Speech-recognition pipeline shared by every transcription request. It is
# built once at import time so the model is not reloaded per call.
pipe = pipeline(
    model="anzorq/w2v-bert-2.0-kbd",
    device=0,
)
@spaces.GPU
def transcribe_speech(audio):
    """Transcribe an audio input with the module-level ``pipe`` pipeline.

    Args:
        audio: The value forwarded to ``pipe``; the UI in this file supplies
            a file path. ``None`` means no audio was captured.

    Returns:
        The ``'text'`` entry of the pipeline result, or the message
        ``"No audio received."`` when ``audio`` is ``None``.
    """
    if audio is not None:
        # The pipeline is asked to work through the audio with
        # chunk_length_s=10 rather than in a single pass.
        result = pipe(audio, chunk_length_s=10)
        return result['text']

    # The microphone input can submit None when nothing was recorded.
    return "No audio received."
def transcribe_from_youtube(url):
    """Download the audio of a YouTube video and return its transcription.

    Args:
        url: The YouTube video URL entered by the user. ``None``, empty or
            whitespace-only values are treated as "nothing entered".

    Returns:
        The transcription produced by ``transcribe_speech``, or the message
        ``"No URL received."`` when no URL was supplied.

    Raises:
        gr.Error: If the video exposes no audio-only stream.
    """
    # Function-scope import so this block does not depend on a change to the
    # file's import section.
    import tempfile

    if not url or not url.strip():
        return "No URL received."

    # Download audio from YouTube using pytube.
    yt = YouTube(url.strip())
    audio_stream = yt.streams.filter(only_audio=True).first()
    if audio_stream is None:
        raise gr.Error("No audio stream is available for this video.")

    # Every request gets its own temporary directory: concurrent requests can
    # no longer overwrite or delete each other's "tmp.mp4", and the download
    # is removed on exit even when transcription raises.
    with tempfile.TemporaryDirectory() as workdir:
        audio_path = audio_stream.download(output_path=workdir, filename="tmp.mp4")
        return transcribe_speech(audio_path)
def populate_metadata(url):
    """Look up the thumbnail URL and title for a YouTube video URL.

    This is bound to the URL textbox's ``change`` event, so it also receives
    empty and partially typed URLs. Those are answered with empty metadata
    instead of an exception.

    Args:
        url: The current content of the URL textbox.

    Returns:
        A ``(thumbnail_url, title)`` tuple, or ``(None, None)`` when the URL
        is blank or pytube cannot resolve it.
    """
    if not url or not url.strip():
        return None, None

    # Function-scope import: pytube is already a dependency of this file, and
    # importing here leaves the file's import section untouched.
    from pytube.exceptions import PytubeError

    try:
        yt = YouTube(url.strip())
        return yt.thumbnail_url, yt.title
    except PytubeError:
        # Invalid or unavailable video: clear the preview rather than raise.
        return None, None
# Gradio UI with two tabs: one transcribes microphone recordings, the other
# transcribes the audio of a YouTube video given its URL.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # Page header. Adjacent string literals are concatenated, so the markup
    # can be indented here without changing the HTML that is sent.
    gr.HTML(
        "\n"
        '<div style="text-align: center; max-width: 500px; margin: 0 auto;">\n'
        "<div>\n"
        "<h1>Youtube Speech Transcription</h1>\n"
        "</div>\n"
        '<p style="margin-bottom: 10px; font-size: 94%">\n'
        "Speech to text transcription of Youtube videos using Wav2Vec2-BERT\n"
        "</p>\n"
        "</div>\n"
    )

    with gr.Tab("Microphone Input"):
        gr.Markdown("## Transcribe speech from microphone")
        mic_audio = gr.Audio(sources="microphone", type="filepath", label="Speak into your microphone")
        mic_button = gr.Button("Transcribe")
        mic_output = gr.Textbox(label="Transcription")
        mic_button.click(fn=transcribe_speech, inputs=mic_audio, outputs=mic_output)

    with gr.Tab("YouTube URL"):
        gr.Markdown("## Transcribe speech from YouTube video")
        youtube_url = gr.Textbox(label="Enter YouTube video URL")
        title = gr.Label(label="Video Title")
        img = gr.Image(label="Thumbnail", height=120, width=120)
        # Distinct names per tab so these components do not shadow the
        # microphone tab's button and output box.
        youtube_button = gr.Button("Transcribe")
        youtube_output = gr.Textbox(label="Transcription", placeholder="Transcription Output", lines=10)
        youtube_button.click(fn=transcribe_from_youtube, inputs=youtube_url, outputs=youtube_output)
        # Refresh the thumbnail and title whenever the URL text changes.
        youtube_url.change(populate_metadata, inputs=[youtube_url], outputs=[img, title])

demo.launch()