Spaces:

anzorq
/

w2v-bert-2.0-kbd

Paused

File size: 2,487 Bytes

bfb5ccb
1c4ba6c
8ca2e83
 
 
 
1c4ba6c
0863f8c
8ca2e83
0863f8c
1c4ba6c
bfb5ccb
8ca2e83
1c4ba6c
 
 
0863f8c
8ca2e83
eaed2c2
1c4ba6c
 
 
 
eaed2c2
550d732
1c4ba6c
 
550d732
 
 
eaed2c2
1c4ba6c
 
 
 
29c16a4
1c4ba6c
 
 
 
 
 
 
 
 
 
 
 
 
eaed2c2
 
6fd478d
eaed2c2
 
 
 
 
 
 
 
1c4ba6c
0863f8c
eaed2c2
1c4ba6c
eaed2c2
 
1c4ba6c
8ca2e83
0863f8c

import spaces
import os
import gradio as gr
import torch
import torchaudio
from transformers import AutoModelForCTC, Wav2Vec2BertProcessor
from pytube import YouTube
from transformers import pipeline

# Build the pipeline once at import time so every request reuses the same
# loaded model; device=0 places it on the first CUDA device.
pipe = pipeline(
    model="anzorq/w2v-bert-2.0-kbd",
    device=0,
)

@spaces.GPU
def transcribe_speech(audio):
    """Transcribe an audio input with the module-level ``pipe`` pipeline.

    Args:
        audio: Audio input forwarded unchanged to ``pipe``; callers in this
            file pass a file path. ``None`` means nothing was recorded.

    Returns:
        The ``'text'`` entry of the pipeline result, or a fixed notice when
        ``audio`` is ``None``.
    """
    if audio is not None:
        # Long recordings are processed in 10-second chunks.
        result = pipe(audio, chunk_length_s=10)
        return result['text']

    # The microphone widget hands over None when nothing was recorded.
    return "No audio received."

def transcribe_from_youtube(url):
    """Download the audio track of a YouTube video and transcribe it.

    The audio is saved into a per-call temporary directory, so concurrent
    requests never share a file and the download is removed even when
    transcription raises.

    Args:
        url: URL of the YouTube video, as typed by the user.

    Returns:
        The transcription text, or a short notice when ``url`` is empty or
        the video exposes no audio-only stream.
    """
    # Local import keeps this block self-contained without touching the
    # file-level import order.
    import tempfile

    if not url or not url.strip():
        return "No URL provided."

    yt = YouTube(url.strip())

    # first() yields None instead of raising IndexError when nothing matches.
    audio_stream = yt.streams.filter(only_audio=True).first()
    if audio_stream is None:
        return "No audio stream found for this video."

    with tempfile.TemporaryDirectory() as tmp_dir:
        audio_path = audio_stream.download(output_path=tmp_dir, filename="tmp.mp4")
        # Transcribe while the directory still exists; leaving the `with`
        # block deletes the download on success and on failure alike.
        return transcribe_speech(audio_path)

def populate_metadata(url):
    """Look up the thumbnail URL and title of a YouTube video.

    This is bound to the URL textbox's ``change`` event, so it is invoked
    with empty and half-typed input as well as complete URLs. Those cases
    return ``(None, None)`` instead of raising.

    Args:
        url: Current contents of the YouTube URL textbox.

    Returns:
        A ``(thumbnail_url, title)`` tuple, or ``(None, None)`` when ``url``
        is empty or pytube cannot resolve it.
    """
    if not url or not url.strip():
        return None, None

    # pytube is already a dependency of this file; import its base
    # exception locally so the block stays self-contained.
    from pytube.exceptions import PytubeError

    try:
        yt = YouTube(url.strip())
        return yt.thumbnail_url, yt.title
    except PytubeError:
        # Invalid or unresolvable URL (e.g. still being typed).
        return None, None

# Two-tab Gradio UI: microphone transcription and YouTube transcription.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # Centered page header.
    gr.HTML(
        """
            <div style="text-align: center; max-width: 500px; margin: 0 auto;">
              <div>
                <h1>Youtube Speech Transcription</h1>
              </div>
              <p style="margin-bottom: 10px; font-size: 94%">
                Speech to text transcription of Youtube videos using Wav2Vec2-BERT
              </p>
            </div>
        """
    )

    # Tab 1: record from the microphone, then transcribe on button click.
    with gr.Tab("Microphone Input"):
        gr.Markdown("## Transcribe speech from microphone")
        mic_audio = gr.Audio(
            sources="microphone",
            type="filepath",
            label="Speak into your microphone",
        )
        mic_button = gr.Button("Transcribe")
        mic_output = gr.Textbox(label="Transcription")

        mic_button.click(
            fn=transcribe_speech,
            inputs=mic_audio,
            outputs=mic_output,
        )

    # Tab 2: paste a YouTube URL, preview its metadata, then transcribe.
    with gr.Tab("YouTube URL"):
        gr.Markdown("## Transcribe speech from YouTube video")
        youtube_url = gr.Textbox(label="Enter YouTube video URL")
        title = gr.Label(label="Video Title")
        img = gr.Image(label="Thumbnail", height=120, width=120)
        transcribe_button = gr.Button("Transcribe")
        transcription_output = gr.Textbox(
            label="Transcription",
            placeholder="Transcription Output",
            lines=10,
        )

        transcribe_button.click(
            fn=transcribe_from_youtube,
            inputs=youtube_url,
            outputs=transcription_output,
        )
        # Refresh the thumbnail and title whenever the URL text changes.
        youtube_url.change(
            fn=populate_metadata,
            inputs=[youtube_url],
            outputs=[img, title],
        )

demo.launch()