# STT-Swedish / app.py
# Author: Neprox
# Commit c1c59f8: Add test version of youtube processing functionality
# (File size at this commit: 2.13 kB)
import os

import gradio as gr
from moviepy.editor import AudioFileClip
from pytube import YouTube
# NOTE(review): Dataset and Audio are normally provided by the `datasets`
# package, not `transformers` -- confirm that this import resolves.
from transformers import Dataset, Audio
from transformers import pipeline
# Pipeline used by transcribe(); it is created once, when the module is loaded,
# from the model identified by MODEL_ID.
MODEL_ID = "Neprox/model"
pipe = pipeline(model=MODEL_ID)
def download_from_youtube(url):
    """Download the first audio-only mp4 stream of a YouTube video.

    Args:
        url: Link to the YouTube video.

    Returns:
        The value returned by the stream's ``download()`` call, which the
        caller treats as the path of the downloaded file.
    """
    video = YouTube(url)
    audio_streams = video.streams.filter(only_audio=True, file_extension='mp4')
    first_stream = audio_streams.first()
    return first_stream.download()
def create_30s_segments(fpath):
    """Split an audio file into consecutive 30-second WAV segments.

    The segments are written to the ``segmented_audios`` directory (created
    if it does not exist) as ``segment_<i>.wav``. A trailing remainder of two
    seconds or less is dropped.

    Args:
        fpath: Path to the audio file to split.

    Returns:
        List of paths of the written segment files, in playback order. The
        list is empty when the whole clip is two seconds long or shorter.
    """
    segment_len = 30  # seconds per full segment
    min_tail_len = 2  # a remainder must be longer than this to be kept
    out_dir = "segmented_audios"

    os.makedirs(out_dir, exist_ok=True)

    segment_paths = []
    sound = AudioFileClip(fpath)
    try:
        duration = sound.duration
        n_full_segments = int(duration / segment_len)
        len_last_segment = duration % segment_len

        for i in range(n_full_segments + 1):
            # Compute the start first: the end of the trailing segment is
            # derived from the clip duration, not from a previous iteration.
            start = i * segment_len
            if i < n_full_segments:
                end = start + segment_len
            elif len_last_segment > min_tail_len:
                end = duration
            else:
                # Trailing remainder is too short to be worth transcribing.
                continue

            segment_path = os.path.join(out_dir, f"segment_{i}.wav")
            sound.subclip(start, end).write_audiofile(segment_path)
            segment_paths.append(segment_path)
    finally:
        # Release the reader held by the clip even if writing fails.
        sound.close()

    return segment_paths
def transcribe(audio, url):
    """Transcribe speech from a YouTube link or from a recorded audio file.

    A non-empty ``url`` takes precedence over ``audio``: the video's audio is
    downloaded, cut into segments, each segment is transcribed, and the
    segment texts are joined with single spaces.

    Args:
        audio: Path to an audio file (the Gradio audio input is configured
            with ``type="filepath"``), or None when nothing was recorded.
        url: YouTube link; empty or None to transcribe ``audio`` instead.

    Returns:
        The transcription as a single string. An empty string is returned
        when there is nothing to transcribe.
    """
    if url:
        fpath = download_from_youtube(url)
        segment_paths = create_30s_segments(fpath)
        if not segment_paths:
            return ""
        # Given a list of audio file paths, the pipeline returns one result
        # dict per path, in the same order.
        results = pipe(segment_paths)
        return " ".join(result["text"].strip() for result in results)

    if audio is None:
        # Neither a link nor a recording was provided.
        return ""
    return pipe(audio)["text"]
# Input widgets, in the order transcribe() receives them: a microphone
# recording passed as a file path, then a single-line YouTube link field.
microphone_input = gr.Audio(source="microphone", type="filepath")
youtube_url_input = gr.Text(
    max_lines=1,
    placeholder="Enter YouTube Link with Swedish speech to be transcribed",
)

# Build the web UI around transcribe() and start serving it.
iface = gr.Interface(
    fn=transcribe,
    inputs=[microphone_input, youtube_url_input],
    outputs="text",
    title="Whisper Small Swedish",
    description="Realtime demo for Swedish speech recognition using a fine-tuned Whisper small model.",
)
iface.launch()