File size: 3,007 Bytes
94cbd93
ffde417
94cbd93
cad9f2f
fb79c03
c1c59f8
ffde417
cad9f2f
ffde417
c1c59f8
 
 
 
 
143dc39
 
 
51423ee
143dc39
40da39c
c1c59f8
 
 
98a9509
c1c59f8
 
 
40da39c
 
 
8f47d53
 
c1c59f8
 
 
40da39c
 
8f1f85c
c1c59f8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98a9509
 
c1c59f8
40da39c
cad9f2f
c1c59f8
40da39c
c1c59f8
143dc39
c1c59f8
8f47d53
143dc39
 
 
 
 
cad9f2f
 
 
 
ffde417
 
 
 
cad9f2f
c1c59f8
4bedefc
40da39c
cad9f2f
ffde417
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import os
import gradio as gr
from transformers import pipeline
from pytube import YouTube
from datasets import Dataset, Audio
from moviepy.editor import AudioFileClip

# Model identifier handed to `transformers.pipeline`. The Gradio description
# further down presents it as a fine-tuned Whisper small model for Swedish.
MODEL_ID = "Neprox/model"

# Built once at import time so every request reuses the same pipeline object.
pipe = pipeline(model=MODEL_ID)

def download_from_youtube(url):
    """Download the first audio-only MP4 stream of a YouTube video.

    Args:
        url: URL of the YouTube video.

    Returns:
        The path of the downloaded file, as returned by pytube's
        ``Stream.download()``.

    Raises:
        ValueError: If the video offers no audio-only MP4 stream.
    """
    streams = YouTube(url).streams.filter(only_audio=True, file_extension='mp4')
    stream = streams.first()
    # `first()` yields None when the filter matched nothing; fail with a clear
    # message instead of an AttributeError on `None.download()`.
    if stream is None:
        raise ValueError(f"No audio-only mp4 stream available for {url!r}")
    return stream.download()

def get_timestamp(seconds):
    """Format a duration given in seconds as a zero-padded ``MM:SS`` string.

    Fractions of a second are truncated. Minutes are not wrapped at 60, so a
    duration of 75 minutes is rendered as ``75:00``.
    """
    whole_minutes = int(seconds / 60)
    leftover_seconds = int(seconds % 60)
    return "{:02d}:{:02d}".format(whole_minutes, leftover_seconds)

def divide_into_30s_segments(audio_fpath, seconds_max):
    """Split an audio file into consecutive 30-second WAV segments.

    Segments are written to the ``segmented_audios`` directory (created when
    missing) as ``segment_<i>.wav``; a file left over from an earlier call
    with the same index is overwritten.

    Args:
        audio_fpath: Path of the audio file to split.
        seconds_max: Maximum amount of audio, in seconds, to segment. Only
            whole multiples of 30 seconds count, so at most
            ``int(seconds_max / 30)`` segments are produced.

    Returns:
        A ``(segment_paths, segment_start_times)`` tuple of parallel lists:
        the path of each written segment and its start offset in seconds
        within the source audio.
    """
    segment_seconds = 30
    output_dir = "segmented_audios"
    os.makedirs(output_dir, exist_ok=True)

    max_segments = int(seconds_max / segment_seconds)
    segment_paths = []
    segment_start_times = []

    sound = AudioFileClip(audio_fpath)
    try:
        n_full_segments = int(sound.duration / segment_seconds)
        len_last_segment = sound.duration % segment_seconds

        # When the audio is longer than the requested maximum, only full
        # segments are emitted and there is no trailing partial segment.
        if n_full_segments > max_segments:
            n_full_segments = max_segments
            len_last_segment = 0

        # One extra slot for the trailing partial segment, capped by the limit.
        segments_available = n_full_segments + 1
        for i in range(min(segments_available, max_segments)):
            start = i * segment_seconds

            is_last_segment = i == n_full_segments
            if is_last_segment:
                # Skip the trailing segment if it is not longer than two seconds.
                if not len_last_segment > 2:
                    continue
                end = start + len_last_segment
            else:
                end = start + segment_seconds

            segment_path = os.path.join(output_dir, f"segment_{i}.wav")
            sound.subclip(start, end).write_audiofile(segment_path)
            segment_paths.append(segment_path)
            segment_start_times.append(start)
    finally:
        # Release the reader held by the clip even when writing a segment fails.
        sound.close()

    return segment_paths, segment_start_times


def transcribe(audio, url, seconds_max):
    """Transcribe a YouTube video or, when no URL is given, a recording.

    Args:
        audio: File path of the recorded audio; only used when ``url`` is
            empty.
        url: YouTube URL. When non-empty, the video's audio is downloaded,
            split into 30-second segments and each segment is transcribed.
        seconds_max: Maximum number of seconds of the video to transcribe.

    Returns:
        For a URL, one header line plus transcript per segment, with segments
        numbered from 1; an empty string when no segment could be extracted.
        Otherwise the transcript of the recording.

    Raises:
        ValueError: If neither a URL nor a recording was supplied.
    """
    if not url:
        if audio is None:
            # Fail with an explicit message instead of passing None to the pipeline.
            raise ValueError("Provide either a microphone recording or a YouTube URL.")
        return pipe(audio)["text"]

    fpath = download_from_youtube(url)
    segment_paths, segment_start_times = divide_into_30s_segments(fpath, seconds_max)

    # Nothing to transcribe (e.g. the audio is too short to yield a segment);
    # indexing the empty dataset below would otherwise raise IndexError.
    if not segment_paths:
        return ""

    audio_dataset = Dataset.from_dict({"audio": segment_paths}).cast_column("audio", Audio(sampling_rate=16000))
    # Debug output of the dataset and its first decoded example.
    print(audio_dataset)
    print(audio_dataset[0])
    pred = pipe(audio_dataset["audio"])

    n_segments = len(segment_start_times)
    # Number segments from 1 so the last header reads "n/n" rather than "n-1/n".
    return "".join(
        f"[Segment {i}/{n_segments}, start time {get_timestamp(seconds)}]\n{output['text']}\n"
        for i, (seconds, output) in enumerate(zip(segment_start_times, pred), start=1)
    )

# Input widgets, in the positional order expected by `transcribe`.
microphone_input = gr.Audio(source="microphone", type="filepath")
url_input = gr.Text(
    max_lines=1,
    placeholder="Enter YouTube Link with Swedish speech to be transcribed",
    label="YouTube URL",
)
duration_slider = gr.Slider(
    minimum=30,
    maximum=300,
    value=30,
    step=30,
    label="Number of seconds to transcribe",
)

# Web UI wiring the three inputs to `transcribe` and showing its text result.
iface = gr.Interface(
    fn=transcribe,
    inputs=[microphone_input, url_input, duration_slider],
    outputs="text",
    title="Whisper Small Swedish",
    description="Realtime demo for Swedish speech recognition using a fine-tuned Whisper small model.",
)

# Start the Gradio server as soon as the module is run.
iface.launch()