# STT-Swedish / app.py
# Neprox, commit cad9f2f: "Add basic version of transcribing youtube video"
# (File-viewer page residue: raw / history / blame links; file size 1.21 kB)
from transformers import pipeline
import gradio as gr
from pytube import YouTube
from transformers import Dataset, Audio
# Pipeline for the "Neprox/model" checkpoint. It is constructed once at
# import time and reused by every call to transcribe().
pipe = pipeline(
    model="Neprox/model",
)
def transcribe(audio, url):
    """Transcribe speech from a YouTube link or from a recorded audio file.

    Args:
        audio: Path to the audio file recorded by the microphone widget,
            or ``None``/empty when nothing was recorded.
        url: YouTube link. When non-blank it takes precedence over ``audio``.

    Returns:
        The transcribed text as a string. An empty string is returned when
        neither a URL nor a recording was supplied.

    Raises:
        ValueError: If the video offers no audio-only mp4 stream.
    """
    import tempfile

    # Treat a missing or whitespace-only link the same as "no link".
    url = url.strip() if url else ""

    if url:
        # Download YouTube video (audio track only).
        streams = YouTube(url).streams.filter(only_audio=True, file_extension='mp4')
        stream = streams.first()
        if stream is None:
            raise ValueError("No audio-only mp4 stream found for the given YouTube URL")

        # TODO:
        # Process up to 10 minutes by segmenting into 30 second blocks
        # Use pyMovie for selecting time ranges
        # query every block individually
        # Annotate text with timestamps

        # Download into a temporary directory so files do not pile up in the
        # working directory; it is removed once transcription has finished.
        with tempfile.TemporaryDirectory() as tmp_dir:
            audio_fpath = stream.download(output_path=tmp_dir)
            # Hand the file path to the pipeline, exactly like the microphone
            # branch, and return only the text rather than the raw result.
            return pipe(audio_fpath)["text"]

    if not audio:
        # Nothing to transcribe: no link and no recording.
        return ""

    return pipe(audio)["text"]
# Gradio UI: a microphone recorder and a single-line text box for a YouTube
# link. Both values are passed, in this order, to transcribe(audio, url).
iface = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(source="microphone", type="filepath"),  # comma was missing here
        gr.Text(max_lines=1, placeholder="Enter YouTube Link with Swedish speech to be transcribed"),
    ],
    outputs="text",
    title="Whisper Small Swedish",
    description="Realtime demo for Swedish speech recognition using a fine-tuned Whisper small model.",
)

iface.launch()