Spaces:

Neprox
/

STT-Swedish

Runtime error

File size: 2,212 Bytes

ffde417
 
cad9f2f
 
c1c59f8
ffde417
cad9f2f
ffde417
c1c59f8
 
 
 
 
98a9509
c1c59f8
 
 
98a9509
c1c59f8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98a9509
 
c1c59f8
cad9f2f
 
c1c59f8
98a9509
c1c59f8
 
 
 
 
 
cad9f2f
 
 
 
ffde417
 
 
 
cad9f2f
c1c59f8
cad9f2f
 
ffde417

import os

import gradio as gr
from moviepy.editor import AudioFileClip
from pytube import YouTube
# NOTE(review): Dataset and Audio appear to belong to the `datasets` package,
# not `transformers` — confirm; this import likely fails at startup.
from transformers import Dataset, Audio
from transformers import pipeline

# Shared pipeline created once at import time from the "Neprox/model" model;
# transcribe() calls it for every request.
pipe = pipeline(model="Neprox/model")

def download_from_youtube(url):
    """Download the audio track of a YouTube video and return its local path.

    Args:
        url: Address of the YouTube video.

    Returns:
        Path of the downloaded file, as returned by pytube's ``download()``.

    Raises:
        ValueError: If the video offers no audio-only mp4 stream.
    """
    audio_streams = YouTube(url).streams.filter(only_audio=True, file_extension='mp4')
    stream = audio_streams.first()
    if stream is None:
        # An empty query yields None; fail with a clear message instead of an
        # AttributeError raised by calling download() on None.
        raise ValueError(f"No audio-only mp4 stream available for {url!r}")
    return stream.download()

def divide_into_30s_segments(audio_fpath):
    """Split an audio file into consecutive 30-second WAV segments.

    Segments are written to the "segmented_audios" directory (created if
    missing) as "segment_<index>.wav". A trailing remainder of two seconds or
    less is dropped.

    Args:
        audio_fpath: Path of the audio file to split.

    Returns:
        A tuple ``(segment_paths, segment_start_times)``: the paths of the
        written segment files and the start offset, in seconds, of each
        segment within the source audio.
    """
    os.makedirs("segmented_audios", exist_ok=True)

    segment_paths = []
    segment_start_times = []

    sound = AudioFileClip(audio_fpath)
    try:
        n_full_segments = int(sound.duration / 30)
        len_last_segment = sound.duration % 30

        for i in range(n_full_segments + 1):
            # Compute the start first: the end of the trailing segment is
            # derived from it.
            start = i * 30
            if i < n_full_segments:
                end = start + 30
            elif len_last_segment > 2:
                end = start + len_last_segment
            else:
                # Skip the last segment if it is not longer than two seconds.
                continue

            segment_path = os.path.join("segmented_audios", f"segment_{i}.wav")
            sound.subclip(start, end).write_audiofile(segment_path)
            segment_paths.append(segment_path)
            segment_start_times.append(start)
    finally:
        # Release the reader held by the clip even if writing a segment fails.
        sound.close()

    return segment_paths, segment_start_times


def transcribe(audio, url):
    """Transcribe a YouTube video or a recorded audio file to text.

    A non-blank ``url`` takes precedence over ``audio``.

    Args:
        audio: Path of an audio file to transcribe, or None/empty when no
            recording was provided.
        url: YouTube link to download and transcribe, or an empty value to
            transcribe ``audio`` instead.

    Returns:
        The transcribed text. For a YouTube link, the texts of the individual
        30-second segments joined by single spaces. An empty string when
        neither input is provided or no segment could be produced.
    """
    url = url.strip() if url else ""
    if url:
        fpath = download_from_youtube(url)
        # Segment start times are not needed for plain-text output.
        segment_paths, _ = divide_into_30s_segments(fpath)
        if not segment_paths:
            return ""
        # Feed the segment files to the pipeline as a list; each result is
        # expected to be a dict carrying the segment's "text".
        results = pipe(segment_paths)
        return " ".join(result["text"].strip() for result in results)

    if not audio:
        # Nothing recorded and no link given: nothing to transcribe.
        return ""
    return pipe(audio)["text"]

# Input widgets: a microphone recording delivered as a file path, and a
# single-line text box for a YouTube link.
microphone_input = gr.Audio(source="microphone", type="filepath")
youtube_url_input = gr.Text(
    max_lines=1,
    placeholder="Enter YouTube Link with Swedish speech to be transcribed",
)

# Wire both inputs to transcribe() and show its result as plain text.
iface = gr.Interface(
    fn=transcribe,
    inputs=[microphone_input, youtube_url_input],
    outputs="text",
    title="Whisper Small Swedish",
    description="Realtime demo for Swedish speech recognition using a fine-tuned Whisper small model.",
)

iface.launch()