Spaces:
Runtime error
Runtime error
File size: 4,891 Bytes
94cbd93 ffde417 94cbd93 cad9f2f fb79c03 c1c59f8 e626047 142a301 c52791f e626047 321340c e626047 321340c c1c59f8 13b3459 c1c59f8 143dc39 13b3459 143dc39 51423ee 143dc39 40da39c 13b3459 c1c59f8 98a9509 c1c59f8 40da39c 8f47d53 c1c59f8 40da39c 8f1f85c c1c59f8 98a9509 8300d7e 13b3459 142a301 13b3459 8300d7e 3ab9658 c1c59f8 8300d7e 13b3459 142a301 13b3459 142a301 13b3459 cad9f2f c1c59f8 40da39c c1c59f8 143dc39 a3c12f3 e626047 8300d7e cad9f2f 20d4cb1 44d12e1 20d4cb1 b279700 ffde417 142a301 cad9f2f 142a301 e626047 cad9f2f ffde417 44d12e1 ffde417 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 |
import os
import gradio as gr
from transformers import pipeline
from pytube import YouTube
from datasets import Dataset, Audio
from moviepy.editor import AudioFileClip
from deep_translator import GoogleTranslator
# Speech-recognition pipeline built once at import time from the
# "Neprox/STT-Swedish-Whisper" checkpoint; translate() reuses it for every request.
pipe = pipeline(model="Neprox/STT-Swedish-Whisper")
# Target-language choices offered in the UI dropdown. Each entry ends with the
# language code in parentheses, which get_translation() extracts from the label.
languages = [
    "English (en)",
    "German (de)",
    "French (fr)",
    "Spanish (es)",
]
def download_from_youtube(url):
    """
    Downloads the audio track of the given YouTube video and returns the path to the audio file.

    The first audio-only MP4 stream listed for the video is downloaded with
    pytube's default output settings.

    :param url: URL of the YouTube video.
    :return: Path to the downloaded audio file, as returned by pytube.
    :raises ValueError: If the video offers no audio-only MP4 stream.
    """
    streams = YouTube(url).streams.filter(only_audio=True, file_extension='mp4')
    stream = streams.first()
    # An empty query yields None; fail with a clear message instead of an
    # AttributeError on `None.download()`.
    if stream is None:
        raise ValueError(f"No audio-only MP4 stream available for {url!r}")
    return stream.download()
def get_timestamp(seconds):
    """
    Formats a duration in seconds as a zero-padded MM:SS timestamp.

    Minutes are not wrapped at 60, so 3600 seconds becomes "60:00".
    """
    whole_minutes = int(seconds / 60)
    leftover_seconds = int(seconds % 60)
    return "{:02d}:{:02d}".format(whole_minutes, leftover_seconds)
def divide_into_30s_segments(audio_fpath, seconds_max):
    """
    Divides the audio file into 30s segments and returns the paths to the segments and the start times of the segments.

    Segments are written to the "segmented_audios" directory as
    "segment_<index>.wav". A trailing partial segment is only kept when it is
    longer than two seconds.

    :param audio_fpath: Path to the audio file.
    :param seconds_max: Maximum number of seconds to consider. If the audio file is longer than this, it will be truncated.
        Only whole 30s segments fit into this budget.
    :return: Tuple ``(segment_paths, segment_start_times)``; start times are in seconds.
    """
    segment_len = 30
    min_tail_len = 2
    out_dir = "segmented_audios"
    os.makedirs(out_dir, exist_ok=True)

    segment_paths = []
    segment_start_times = []
    sound = AudioFileClip(audio_fpath)
    try:
        n_full_segments = int(sound.duration / segment_len)
        len_last_segment = sound.duration % segment_len
        max_segments = int(seconds_max / segment_len)
        if n_full_segments > max_segments:
            # The clip is longer than the budget: cap it and drop the remainder.
            n_full_segments = max_segments
            len_last_segment = 0

        # Index `n_full_segments` (if reached) is the trailing partial segment.
        for i in range(min(n_full_segments + 1, max_segments)):
            start = i * segment_len
            if i < n_full_segments:
                end = start + segment_len
            elif len_last_segment > min_tail_len:
                end = start + len_last_segment
            else:
                # Trailing remainder is too short to be worth transcribing.
                break
            segment_path = os.path.join(out_dir, f"segment_{i}.wav")
            sound.subclip(start, end).write_audiofile(segment_path)
            segment_paths.append(segment_path)
            segment_start_times.append(start)
    finally:
        # Release the underlying reader even if writing a segment fails.
        sound.close()
    return segment_paths, segment_start_times
def get_translation(text, target_lang="English (en)"):
    """
    Translates the given Swedish text to the language specified.

    :param text: Swedish text to translate.
    :param target_lang: Language label ending in a parenthesised language code, e.g. "German (de)".
    :return: The translated text, or an empty string if there is nothing to translate.
    """
    # Nothing to translate (e.g. a silent segment): skip the translator call
    # and its input validation.
    if not text or not text.strip():
        return ""
    # "English (en)" -> "(en)" -> "en"
    lang_code = target_lang.split(" ")[-1][1:-1]
    return GoogleTranslator(source='sv', target=lang_code).translate(text)
def translate(audio, url, seconds_max, target_lang):
    """
    Transcribes and translates a YouTube video if a url is specified and returns the resulting text.
    If no url is specified, it transcribes and translates the audio file as passed by Gradio.

    :param audio: Audio file as passed by Gradio. Only used if no url is specified.
    :param url: URL of the YouTube video to translate.
    :param seconds_max: Maximum number of seconds to consider. If the audio file is longer than this, it will be truncated.
    :param target_lang: Language label to translate into, e.g. "English (en)".
    :return: Text with the transcription and its translation; for a url, one entry per audio segment.
        If neither a url nor audio is given, a short message asking for input.
    """
    # Treat a whitespace-only URL field as "no URL".
    url = url.strip() if url else ""

    if not url:
        if not audio:
            # Neither input was supplied; tell the user instead of crashing in the pipeline.
            return "No input provided. Record audio with the microphone or enter a YouTube URL."
        transcribed_text = pipe(audio)["text"]
        return (
            "[Transcription]\n"
            f"{transcribed_text}\n"
            f"[Translation to {target_lang}]\n"
            f"{get_translation(transcribed_text, target_lang)}"
        )

    fpath = download_from_youtube(url)
    segment_paths, segment_start_times = divide_into_30s_segments(fpath, seconds_max)
    audio_dataset = Dataset.from_dict({"audio": segment_paths}).cast_column("audio", Audio(sampling_rate=16000))
    pred = pipe(audio_dataset["audio"])

    n_segments = len(segment_start_times)
    parts = []
    for i, (seconds, output) in enumerate(zip(segment_start_times, pred), start=1):
        parts.append(
            f"[Segment {i}/{n_segments}, start time {get_timestamp(seconds)}]\n"
            f"{output['text']}\n"
            f"[Translation to {target_lang}]\n"
            f"{get_translation(output['text'], target_lang)}\n\n"
        )
    return "".join(parts)
# Gradio UI wiring: the four input components are passed positionally to
# translate(audio, url, seconds_max, target_lang), so their order must match
# that signature. The function's return value is shown as plain text.
iface = gr.Interface(
    fn=translate,
    inputs=[
        # audio: microphone recording, handed over as a file path
        gr.Audio(source="microphone", type="filepath", label="Translate from Microphone"),
        # url: optional YouTube link; when set, translate() uses it instead of the recording
        gr.Text(max_lines=1, placeholder="Enter YouTube Link with Swedish speech to be translated", label="Translate from YouTube URL"),
        # seconds_max: in steps of 30 to line up with the 30s segmentation
        gr.Slider(minimum=30, maximum=300, value=30, step=30, label="Number of seconds to translate from YouTube URL"),
        # target_lang: one of the labels in `languages`
        gr.Dropdown(languages, value="English (en)", label="Target language")
    ],
    outputs="text",
    title="Whisper Small Swedish",
    description="Realtime demo for Swedish speech recognition with translation using a fine-tuned Whisper small model.\nChoose EITHER a YouTube URL or use the microphone to record the audio to translate.",
)
# Start the web server when the module is executed.
iface.launch()
|