File size: 5,074 Bytes
94cbd93
ffde417
94cbd93
cad9f2f
fb79c03
c1c59f8
ffde417
8300d7e
 
 
142a301
cad9f2f
8300d7e
 
142a301
 
321340c
 
 
 
 
 
 
 
 
 
 
ffde417
c1c59f8
13b3459
 
 
c1c59f8
 
 
 
143dc39
13b3459
 
 
143dc39
 
51423ee
143dc39
40da39c
13b3459
 
 
 
 
c1c59f8
 
 
98a9509
c1c59f8
 
 
40da39c
 
 
8f47d53
 
c1c59f8
 
 
40da39c
 
8f1f85c
c1c59f8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98a9509
 
8300d7e
13b3459
142a301
13b3459
8300d7e
 
 
 
142a301
c1c59f8
8300d7e
13b3459
142a301
 
13b3459
142a301
13b3459
 
cad9f2f
c1c59f8
40da39c
c1c59f8
143dc39
 
 
 
 
a3c12f3
 
8300d7e
 
cad9f2f
 
 
 
ffde417
 
 
142a301
cad9f2f
142a301
 
 
8300d7e
cad9f2f
ffde417
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import os
import gradio as gr
from transformers import pipeline
from pytube import YouTube
from datasets import Dataset, Audio
from moviepy.editor import AudioFileClip

# import googletrans                  # googletrans api
# from googletrans import Translator  # googletrans api
from google_trans_new import google_translator  

# Model pipeline and translation client, both created once at import time.
pipe = pipeline(model="Neprox/model")
translator = google_translator()

# Target languages offered for translation, formatted as "<Name> (<code>)".
# The list is hard-coded; the googletrans-based variant derived it from
# googletrans.LANGUAGES instead.
languages = [
    f"{name} ({code})"
    for name, code in (
        ("French", "fr"),
        ("English", "en"),
        ("German", "de"),
        ("Spanish", "es"),
    )
]


def download_from_youtube(url):
    """
    Downloads the audio of the given YouTube video and returns the path to the downloaded file.
    Only audio-only streams with the mp4 file extension are considered; the first match is used.
    :param url: URL of the YouTube video.
    :return: Path to the downloaded file, as returned by the stream's download().
    :raises ValueError: If the video has no audio-only mp4 stream.
    """
    streams = YouTube(url).streams.filter(only_audio=True, file_extension='mp4')
    stream = streams.first()

    # first() yields None for an empty query; fail with a clear message
    # instead of an AttributeError on None.
    if stream is None:
        raise ValueError(f"No audio-only mp4 stream available for {url!r}")

    return stream.download()

def get_timestamp(seconds):
    """
    Formats a number of seconds as a zero-padded "MM:SS" timestamp.
    Minutes are not wrapped into hours, so values of an hour or more yield e.g. "100:05".
    """
    whole_minutes = int(seconds / 60)
    leftover_seconds = int(seconds % 60)
    return f"{whole_minutes:02d}:{leftover_seconds:02d}"

def divide_into_30s_segments(audio_fpath, seconds_max):
    """
    Divides the audio file into 30s segments and returns the paths to the segments and the start times of the segments.
    Segments are written to "segmented_audios/segment_<i>.wav"; existing files with the same name are overwritten.
    A trailing partial segment is only kept if it is longer than two seconds.
    :param audio_fpath: Path to the audio file.
    :param seconds_max: Maximum number of seconds to consider. If the audio file is longer than this, it will be truncated.
        The limit is applied in whole 30s segments (rounded down).
    :return: Tuple (segment_paths, segment_start_times); start times are in seconds.
    """
    os.makedirs("segmented_audios", exist_ok=True)

    segment_paths = []
    segment_start_times = []

    sound = AudioFileClip(audio_fpath)
    try:
        n_full_segments = int(sound.duration / 30)
        len_last_segment = sound.duration % 30

        # Audio longer than the limit is cut at the last allowed full segment,
        # so no partial trailing segment remains.
        max_segments = int(seconds_max / 30)
        if n_full_segments > max_segments:
            n_full_segments = max_segments
            len_last_segment = 0

        # One extra iteration covers the partial trailing segment, but the
        # total never exceeds max_segments.
        for i in range(min(n_full_segments + 1, max_segments)):
            start = i * 30

            if i == n_full_segments:
                # Skip last segment if it is not longer than two seconds
                if len_last_segment <= 2:
                    continue
                end = start + len_last_segment
            else:
                end = start + 30

            segment_path = os.path.join("segmented_audios", f"segment_{i}.wav")
            sound.subclip(start, end).write_audiofile(segment_path)
            segment_paths.append(segment_path)
            segment_start_times.append(start)
    finally:
        # Release the clip's resources even if writing a segment fails.
        sound.close()

    return segment_paths, segment_start_times

def get_translation(text, target_lang="English (en)"):
    """
    Translates the given text (Swedish transcriptions in this app) to the language specified.
    The source language is not passed to the translator explicitly.
    :param text: Text to translate.
    :param target_lang: Target language formatted as "<Name> (<code>)", e.g. "French (fr)".
        Falls back to "English (en)" when None or empty.
    :return: Result of translator.translate() for the text.
    """
    # Callers may pass None when no language was chosen; parsing None would
    # raise an AttributeError, so use the default instead.
    if not target_lang:
        target_lang = "English (en)"

    # The language code is the parenthesised last word: "(fr)" -> "fr".
    lang_code = target_lang.split(" ")[-1][1:-1]
    return translator.translate(text, lang_tgt=lang_code)


def translate(audio, url, seconds_max, target_lang):
    """
    Transcribes and translates a YouTube video if a url is specified and returns the result as text.
    If no url is specified, it transcribes the audio file as passed by Gradio.
    :param audio: Audio file as passed by Gradio. Only used if no url is specified.
    :param url: URL of the YouTube video to translate.
    :param seconds_max: Maximum number of seconds to consider. If the audio file is longer than this, it will be truncated.
    :param target_lang: Target language formatted as "<Name> (<code>)". Only used if a url is specified;
        falls back to "English (en)" when None or empty.
    :return: For a url, one block per 30s segment with its start time, transcription and translation.
        For an audio file, the transcription only.
    :raises ValueError: If neither a url nor an audio file is given.
    """
    if not url:
        if not audio:
            raise ValueError("Either a YouTube URL or a microphone recording is required.")
        # NOTE(review): the microphone path returns the transcription only,
        # without a translation - confirm this is intended.
        return pipe(audio)["text"]

    # No selection arrives as None; use the same default as get_translation.
    target_lang = target_lang or "English (en)"

    fpath = download_from_youtube(url)
    segment_paths, segment_start_times = divide_into_30s_segments(fpath, seconds_max)
    if not segment_paths:
        # Nothing to transcribe (e.g. audio too short); avoid feeding an empty dataset to the model.
        return ""

    audio_dataset = Dataset.from_dict({"audio": segment_paths}).cast_column("audio", Audio(sampling_rate=16000))
    pred = pipe(audio_dataset["audio"])

    n_segments = len(segment_start_times)
    blocks = []
    for i, (seconds, output) in enumerate(zip(segment_start_times, pred), start=1):
        blocks.append(
            f"[Segment {i}/{n_segments}, start time {get_timestamp(seconds)}]\n"
            f"{output['text']}\n"
            f"[Translation ({target_lang})]\n"
            f"{get_translation(output['text'], target_lang)}\n\n"
        )
    return "".join(blocks)

# Gradio UI. The inputs are passed to `translate` positionally, in this order:
# audio, url, seconds_max, target_lang.
iface = gr.Interface(
    fn=translate,
    inputs=[
        gr.Audio(source="microphone", type="filepath", label="Translate from Microphone"),
        gr.Text(max_lines=1, placeholder="Enter YouTube Link with Swedish speech to be translated", label="Translate from YouTube URL"),
        gr.Slider(minimum=30, maximum=300, value=30, step=30, label="Number of seconds to translate from YouTube URL"),
        # Preselect a language so that an untouched dropdown does not hand
        # None to `translate` as the target language.
        gr.Dropdown(languages, value="English (en)", label="Target language"),
    ],
    outputs="text",
    title="Whisper Small Swedish",
    description="Realtime demo for Swedish speech recognition using a fine-tuned Whisper small model.",
)

iface.launch()