Spaces:

irena
/

ASR_ID2223

Runtime error

App Files Files Community

ASR_ID2223 / app.py

irena

Update app.py

77f334e about 2 years ago

raw

history blame

6.57 kB


	import os
	import gradio as gr
	from transformers import pipeline
	from pytube import YouTube
	from datasets import Dataset, Audio
	from moviepy.editor import AudioFileClip

	# Speech-recognition pipeline, created once at module load and reused by every
	# transcribe() call.
	# NOTE(review): the checkpoint name ends in "sv-SE" while the UI strings in this
	# file mention Chinese — confirm which language is actually intended.
	pipe = pipeline(model="irena/whisper-small-sv-SE")

	def download_from_youtube(url):
	    """
	    Download the audio track of a YouTube video and return the local file path.

	    :param url: URL of the YouTube video.
	    :return: Path of the downloaded file, as returned by the stream's ``download()``.
	    :raises ValueError: If the video exposes no audio-only mp4 stream.
	    """
	    streams = YouTube(url).streams.filter(only_audio=True, file_extension='mp4')
	    stream = streams.first()
	    # ``first()`` gives None when the filter matches nothing; fail with a clear
	    # message instead of an AttributeError on ``None.download()``.
	    if stream is None:
	        raise ValueError(f"No audio-only mp4 stream available for {url!r}")
	    return stream.download()

	def get_timestamp(seconds):
	    """
	    Format a duration in seconds as a zero-padded ``MM:SS`` string.

	    Fractions of a second are truncated; minutes are not wrapped at 60.
	    """
	    whole_minutes = int(seconds / 60)
	    leftover_seconds = int(seconds % 60)
	    return f"{whole_minutes:02d}:{leftover_seconds:02d}"

	def divide_into_30s_segments(audio_fpath, seconds_max):
	    """
	    Split an audio file into 30-second WAV segments under ``segmented_audios/``.

	    A trailing partial segment is written only if it is longer than two seconds.
	    At most ``int(seconds_max / 30)`` segments are written.

	    :param audio_fpath: Path to the audio file.
	    :param seconds_max: Maximum number of seconds to consider. If the audio file is longer than this, it will be truncated.
	    :return: Tuple ``(segment_paths, segment_start_times)``: the paths of the
	        written segment files and the start offset (in seconds) of each one.
	    """
	    # exist_ok avoids the check-then-create race of exists() + makedirs().
	    os.makedirs("segmented_audios", exist_ok=True)

	    segment_paths = []
	    segment_start_times = []

	    sound = AudioFileClip(audio_fpath)
	    try:
	        n_full_segments = int(sound.duration / 30)
	        len_last_segment = sound.duration % 30

	        max_segments = int(seconds_max / 30)
	        if n_full_segments > max_segments:
	            # Audio is longer than the limit: keep only full segments.
	            n_full_segments = max_segments
	            len_last_segment = 0

	        segments_available = n_full_segments + 1
	        for i in range(min(segments_available, max_segments)):
	            start = i * 30

	            # Skip last segment if it is smaller than two seconds
	            is_last_segment = i == n_full_segments
	            if is_last_segment and not len_last_segment > 2:
	                continue
	            elif is_last_segment:
	                end = start + len_last_segment
	            else:
	                end = (i + 1) * 30

	            segment_path = os.path.join("segmented_audios", f"segment_{i}.wav")
	            segment = sound.subclip(start, end)
	            segment.write_audiofile(segment_path)
	            segment_paths.append(segment_path)
	            segment_start_times.append(start)
	    finally:
	        # Release the source clip once all segments are written (or on error);
	        # the original never closed it.
	        sound.close()

	    return segment_paths, segment_start_times
	def get_translation(text):
	    """
	    Placeholder for Chinese-to-English translation.

	    The argument is currently ignored and a fixed TODO notice is returned.
	    """
	    placeholder = "TODO: Make API call to Google Translate to get English translation"
	    return placeholder

	def transcribe(audio, url, seconds_max):
	    """
	    Transcribe a YouTube video if a URL is given, otherwise the Gradio audio file.

	    :param audio: Audio file as passed by Gradio. Only used if no url is specified.
	    :param url: YouTube URL to transcribe. Empty, None, or whitespace-only means "no URL".
	    :param seconds_max: Maximum number of seconds to consider. If the audio file is longer than this, it will be truncated.
	    :return: For a URL, one text block per segment (header with index and start
	        time, the transcription, then the translation placeholder); for an
	        audio file, the plain transcription.
	    :raises ValueError: If neither a URL nor an audio file is provided.
	    """
	    url = url.strip() if url else ""

	    if not url:
	        # Fail early with a readable message rather than handing None to the pipeline.
	        if audio is None:
	            raise ValueError("Provide a YouTube URL or an audio recording to transcribe.")
	        return pipe(audio)["text"]

	    fpath = download_from_youtube(url)
	    segment_paths, segment_start_times = divide_into_30s_segments(fpath, seconds_max)

	    audio_dataset = Dataset.from_dict({"audio": segment_paths}).cast_column("audio", Audio(sampling_rate=16000))
	    pred = pipe(audio_dataset["audio"])

	    n_segments = len(segment_start_times)
	    # Collect the pieces and join once instead of repeated string concatenation.
	    parts = []
	    for i, (seconds, output) in enumerate(zip(segment_start_times, pred)):
	        parts.append(f"[Segment {i+1}/{n_segments}, start time {get_timestamp(seconds)}]\n")
	        parts.append(f"{output['text']}\n")
	        parts.append(f"[Translation]\n{get_translation(output['text'])}\n\n")
	    return "".join(parts)

	# Gradio UI wired to transcribe(audio, url, seconds_max); the three input
	# widgets are listed in that positional order.
	# NOTE(review): the labels below say "Chinese" while the model loaded in this
	# file is named "...-sv-SE" — confirm the intended language.
	block = gr.Interface(
	    title="Whisper Small Chinese",
	    description="Realtime Chinese speech recognition",
	    fn=transcribe,
	    inputs=[
	        gr.Audio(
	            source="microphone",
	            type="filepath",
	            label="Transcribe from Microphone",
	        ),
	        gr.Text(
	            max_lines=1,
	            placeholder="Enter YouTube Link which has a Chinese video",
	            label="Transcribe from YouTube URL",
	        ),
	        gr.Slider(
	            minimum=30,
	            maximum=300,
	            value=30,
	            step=30,
	            label="Number of seconds to transcribe from YouTube URL",
	        ),
	    ],
	    outputs="text",
	)

	# Start the web app as soon as the module is executed.
	block.launch()























	# Disabled earlier version of this app, kept inside a bare string literal so it
	# never executes.
	# NOTE(review): the TabbedInterface call in it references `yt`, which is not
	# defined anywhere in this text — consider deleting the whole block.
	'''
	import os
	import gradio as gr
	from transformers import pipeline
	import gradio as gr
	import torch
	import spacy

	os.system('pip install https://huggingface.co/Armandoliv/es_pipeline/resolve/main/es_pipeline-any-py3-none-any.whl')

	pipe = pipeline(model="irena/whisper-small-sv-SE")
	nlp_ner = spacy.load("es_pipeline")
	def main_generator(youtube_id:str):
	YouTubeID = youtube_id.split("https://www.youtube.com/watch?v=") #
	if len(YouTubeID)>1:
	YouTubeID = YouTubeID[1]
	else:
	YouTubeID ='xOZM-1p-jAk'

	OutputFile = f'test_audio_youtube_{YouTubeID}.m4a'

	os.system(f"youtube-dl -o {OutputFile} {YouTubeID} --extract-audio --restrict-filenames -f 'bestaudio[ext=m4a]'")

	result = pipe(OutputFile)
	text = result['text']

	output_list = []

	output_list.append(text)

	return text



	def transcribe(audio):
	text = pipe(audio)["text"]
	return text

	demo = gr.Blocks()


	iface = gr.Interface(
	fn=transcribe,
	inputs=gr.Audio(source="microphone", type="filepath"),
	outputs="text",
	title="Whisper Small Swedish-Microphone",
	description="Realtime demo for Swedish speech recognition using a fine-tuned Whisper small model. An audio for recognize.",
	)

	inputs = [gr.Textbox(lines=1, placeholder="Link of youtube video here...", label="Input")]
	outputs = gr.HighlightedText()
	title="Transcription of Swedish videos"
	description = "This demo uses small Whisper to transcribe what is spoken in a swedish video"
	examples = ['https://www.youtube.com/watch?v=6eWhV7xYH-Q']
	io = gr.Interface(fn=main_generator, inputs=inputs, outputs=outputs, title=title, description = description, examples = examples,

	css= """.gr-button-primary { background: -webkit-linear-gradient(
	90deg, #355764 0%, #55a8a1 100% ) !important; background: #355764;
	background: linear-gradient(
	90deg, #355764 0%, #55a8a1 100% ) !important;
	background: -moz-linear-gradient( 90deg, #355764 0%, #55a8a1 100% ) !important;
	background: -webkit-linear-gradient(
	90deg, #355764 0%, #55a8a1 100% ) !important;
	color:white !important}"""
	)


	with demo:
	gr.TabbedInterface([iface, yt], ["Transcribe Audio", "Transcribe YouTube"])

	demo.launch(enable_queue=True)

	'''