import os
import gradio as gr
from transformers import pipeline
from pytube import YouTube
from datasets import Dataset, Audio
from moviepy.editor import AudioFileClip
pipe = pipeline(model="irena/whisper-small-sv-SE")
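# Note: with only a model id, `pipeline` infers the task (here,
# automatic-speech-recognition) from the model's metadata on the Hub.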
def download_from_youtube(url):
    """
    Downloads the audio track of the given YouTube video and returns the path to the audio file.
    """
    streams = YouTube(url).streams.filter(only_audio=True, file_extension='mp4')
    fpath = streams.first().download()
    return fpath
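# Note: streams.first() takes whichever audio-only mp4 stream pytube lists
# first, so the bitrate of the downloaded audio is arbitrary.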
def get_timestamp(seconds):
    """
    Creates a %M:%S timestamp from seconds.
    """
    minutes = int(seconds / 60)
    seconds = int(seconds % 60)
    return f"{minutes:02d}:{seconds:02d}"
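# For example, get_timestamp(125) returns "02:05".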
def divide_into_30s_segments(audio_fpath, seconds_max):
    """
    Divides the audio file into 30s segments and returns the paths to the segments and their start times.
    :param audio_fpath: Path to the audio file.
    :param seconds_max: Maximum number of seconds to consider. If the audio file is longer than this, it will be truncated.
    """
    if not os.path.exists("segmented_audios"):
        os.makedirs("segmented_audios")

    sound = AudioFileClip(audio_fpath)
    n_full_segments = int(sound.duration / 30)
    len_last_segment = sound.duration % 30

    max_segments = int(seconds_max / 30)
    if n_full_segments > max_segments:
        n_full_segments = max_segments
        len_last_segment = 0

    segment_paths = []
    segment_start_times = []

    segments_available = n_full_segments + 1
    for i in range(min(segments_available, max_segments)):
        start = i * 30

        # Skip the last segment if it is shorter than two seconds
        is_last_segment = i == n_full_segments
        if is_last_segment and not len_last_segment > 2:
            continue
        elif is_last_segment:
            end = start + len_last_segment
        else:
            end = (i + 1) * 30

        segment_path = os.path.join("segmented_audios", f"segment_{i}.wav")
        segment = sound.subclip(start, end)
        segment.write_audiofile(segment_path)
        segment_paths.append(segment_path)
        segment_start_times.append(start)

    return segment_paths, segment_start_times
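# Illustrative behavior: a 70-second clip with seconds_max=300 yields
# segment_0.wav, segment_1.wav and segment_2.wav with start times [0, 30, 60];
# the trailing 10-second remainder is kept because it exceeds the 2s cutoff.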
def get_translation(text):
    """
    Translates the given Swedish text to English.
    """
    return "TODO: Make API call to Google Translate to get English translation"
def transcribe(audio, url, seconds_max):
    """
    Transcribes a YouTube video if a url is specified and returns the transcription.
    If no url is specified, it transcribes the audio file as passed by Gradio.
    :param audio: Audio file as passed by Gradio. Only used if no url is specified.
    :param url: YouTube URL to transcribe.
    :param seconds_max: Maximum number of seconds to consider. If the audio file is longer than this, it will be truncated.
    """
    if url:
        fpath = download_from_youtube(url)
        segment_paths, segment_start_times = divide_into_30s_segments(fpath, seconds_max)

        audio_dataset = Dataset.from_dict({"audio": segment_paths}).cast_column("audio", Audio(sampling_rate=16000))
        pred = pipe(audio_dataset["audio"])
        text = ""
        n_segments = len(segment_start_times)
        for i, (seconds, output) in enumerate(zip(segment_start_times, pred)):
            text += f"[Segment {i+1}/{n_segments}, start time {get_timestamp(seconds)}]\n"
            text += f"{output['text']}\n"
            text += f"[Translation]\n{get_translation(output['text'])}\n\n"
        return text
    else:
        text = pipe(audio)["text"]
        return text
block = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(source="microphone", type="filepath", label="Transcribe from Microphone"),
        gr.Text(max_lines=1, placeholder="Enter a YouTube link to a Swedish video", label="Transcribe from YouTube URL"),
        gr.Slider(minimum=30, maximum=300, value=30, step=30, label="Number of seconds to transcribe from YouTube URL"),
    ],
    outputs="text",
    title="Whisper Small Swedish",
    description="Realtime Swedish speech recognition",
)
block.launch()
'''
import os
import gradio as gr
from transformers import pipeline
import torch
import spacy
os.system('pip install https://huggingface.co/Armandoliv/es_pipeline/resolve/main/es_pipeline-any-py3-none-any.whl')
pipe = pipeline(model="irena/whisper-small-sv-SE")
nlp_ner = spacy.load("es_pipeline")
def main_generator(youtube_id: str):
    # Extract the video id from a full YouTube URL; fall back to a default id
    YouTubeID = youtube_id.split("https://www.youtube.com/watch?v=")
    if len(YouTubeID) > 1:
        YouTubeID = YouTubeID[1]
    else:
        YouTubeID = 'xOZM-1p-jAk'

    OutputFile = f'test_audio_youtube_{YouTubeID}.m4a'
    os.system(f"youtube-dl -o {OutputFile} {YouTubeID} --extract-audio --restrict-filenames -f 'bestaudio[ext=m4a]'")

    result = pipe(OutputFile)
    text = result['text']
    return text
def transcribe(audio):
    text = pipe(audio)["text"]
    return text
demo = gr.Blocks()
iface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs="text",
    title="Whisper Small Swedish-Microphone",
    description="Realtime demo for Swedish speech recognition using a fine-tuned Whisper small model. Record audio to transcribe.",
)
inputs = [gr.Textbox(lines=1, placeholder="Link to a YouTube video here...", label="Input")]
outputs = gr.HighlightedText()
title = "Transcription of Swedish videos"
description = "This demo uses the fine-tuned Whisper small model to transcribe what is spoken in a Swedish video."
examples = ['https://www.youtube.com/watch?v=6eWhV7xYH-Q']
io = gr.Interface(
    fn=main_generator,
    inputs=inputs,
    outputs=outputs,
    title=title,
    description=description,
    examples=examples,
    css="""
        .gr-button-primary {
            background: #355764;
            background: linear-gradient(90deg, #355764 0%, #55a8a1 100%) !important;
            background: -moz-linear-gradient(90deg, #355764 0%, #55a8a1 100%) !important;
            background: -webkit-linear-gradient(90deg, #355764 0%, #55a8a1 100%) !important;
            color: white !important;
        }
    """,
)
with demo:
    gr.TabbedInterface([iface, io], ["Transcribe Audio", "Transcribe YouTube"])

demo.launch(enable_queue=True)
'''