File size: 2,382 Bytes
94cbd93
ffde417
94cbd93
cad9f2f
fb79c03
c1c59f8
ffde417
cad9f2f
8f47d53
ffde417
c1c59f8
 
 
 
 
98a9509
c1c59f8
 
 
98a9509
c1c59f8
 
 
8f47d53
 
 
 
c1c59f8
 
 
 
8f1f85c
c1c59f8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98a9509
 
c1c59f8
cad9f2f
 
c1c59f8
98a9509
c1c59f8
3b7997e
c1c59f8
8f47d53
c1c59f8
 
 
cad9f2f
 
 
 
ffde417
 
 
 
cad9f2f
c1c59f8
cad9f2f
 
ffde417
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import os
import gradio as gr
from transformers import pipeline
from pytube import YouTube
from datasets import Dataset, Audio
from moviepy.editor import AudioFileClip

# ASR pipeline backed by a fine-tuned Whisper model hosted on the Hugging Face Hub.
pipe = pipeline(model="Neprox/model")
# Cap on 30-second segments per video: 10 segments x 30 s = 5 minutes of audio.
MAX_SEGMENTS = 10

def download_from_youtube(url):
    """Download the audio-only mp4 stream of a YouTube video.

    Args:
        url: Full YouTube video URL.

    Returns:
        Local filesystem path of the downloaded audio file.
    """
    audio_streams = YouTube(url).streams.filter(only_audio=True, file_extension='mp4')
    return audio_streams.first().download()

def divide_into_30s_segments(audio_fpath):
    """Split an audio file into consecutive WAV segments of at most 30 seconds.

    Segments are written to the ``segmented_audios`` directory as
    ``segment_<i>.wav``. At most ``MAX_SEGMENTS`` segments are produced;
    a trailing partial segment of 2 seconds or less is dropped.

    Args:
        audio_fpath: Path to the source audio file (any format moviepy reads).

    Returns:
        Tuple ``(segment_paths, segment_start_times)`` — the written file
        paths and the start offset (in seconds) of each segment.
    """
    os.makedirs("segmented_audios", exist_ok=True)

    sound = AudioFileClip(audio_fpath)
    try:
        n_full_segments = int(sound.duration / 30)
        len_last_segment = sound.duration % 30

        # Enforce the cap. ``>=`` (not ``>``) so that a clip with exactly
        # MAX_SEGMENTS full segments plus a remainder does not emit
        # MAX_SEGMENTS + 1 segments.
        if n_full_segments >= MAX_SEGMENTS:
            n_full_segments = MAX_SEGMENTS
            len_last_segment = 0

        segment_paths = []
        segment_start_times = []

        for i in range(n_full_segments + 1):
            start = i * 30

            # The final iteration covers the partial remainder, if any.
            is_last_segment = i == n_full_segments
            if is_last_segment and len_last_segment <= 2:
                # Skip a trailing segment of two seconds or less.
                continue
            end = start + len_last_segment if is_last_segment else (i + 1) * 30

            segment_path = os.path.join("segmented_audios", f"segment_{i}.wav")
            segment = sound.subclip(start, end)
            segment.write_audiofile(segment_path)
            segment_paths.append(segment_path)
            segment_start_times.append(start)
    finally:
        # Release the underlying audio reader (original leaked it).
        sound.close()

    return segment_paths, segment_start_times


def transcribe(audio, url):
    """Transcribe speech from a microphone recording or a YouTube URL.

    If ``url`` is non-empty it takes precedence: the video's audio is
    downloaded, split into <=30 s segments, and each segment is transcribed.
    Otherwise the microphone recording at path ``audio`` is transcribed.

    Args:
        audio: File path of the recorded audio, or None if not recorded.
        url: YouTube video URL, or an empty string.

    Returns:
        The transcription as a single string ("" when no input was given).
    """
    if url:
        fpath = download_from_youtube(url)
        segment_paths, segment_start_times = divide_into_30s_segments(fpath)

        audio_dataset = Dataset.from_dict({"audio": segment_paths}).cast_column("audio", Audio())
        predictions = pipe(audio_dataset)
        # The pipeline yields one {"text": ...} dict per segment; join them so
        # the Gradio "text" output gets a string instead of a list repr.
        return " ".join(p["text"].strip() for p in predictions)

    if audio is None:
        # Neither a recording nor a URL was provided — avoid pipe(None) crash.
        return ""

    return pipe(audio)["text"]

# Gradio UI: record from the microphone or paste a YouTube link; the
# transcription is shown as plain text.
microphone_input = gr.Audio(source="microphone", type="filepath")
youtube_input = gr.Text(max_lines=1, placeholder="Enter YouTube Link with Swedish speech to be transcribed")

iface = gr.Interface(
    fn=transcribe,
    inputs=[microphone_input, youtube_input],
    outputs="text",
    title="Whisper Small Swedish",
    description="Realtime demo for Swedish speech recognition using a fine-tuned Whisper small model.",
)

iface.launch()