Spaces:

facebook
/

XLS-R-300m-EN-15

Build error

App Files Files Community

XLS-R-300m-EN-15 / app.py

patrickvonplaten

Update app.py

96bd204 about 3 years ago

raw

history blame contribute delete

3.14 kB

	import os
	os.system("pip install gradio==2.8.0b2")
	import gradio as gr
	import librosa
	from transformers import AutoFeatureExtractor, AutoTokenizer, SpeechEncoderDecoderModel

	feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-xls-r-300m-en-to-15")
	tokenizer = AutoTokenizer.from_pretrained("facebook/wav2vec2-xls-r-300m-en-to-15", use_fast=False)
	model = SpeechEncoderDecoderModel.from_pretrained("facebook/wav2vec2-xls-r-300m-en-to-15")

	def process_audio_file(file):
	data, sr = librosa.load(file)
	if sr != 16000:
	data = librosa.resample(data, sr, 16000)
	print(data.shape)
	input_values = feature_extractor(data, return_tensors="pt").input_values
	return input_values

	def transcribe(file_mic, file_upload, target_language):

	target_code = target_language.split("(")[-1].split(")")[0]
	forced_bos_token_id = MAPPING[target_code]

	warn_output = ""
	if (file_mic is not None) and (file_upload is not None):
	warn_output = "WARNING: You've uploaded an audio file and used the microphone. The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
	file = file_mic
	elif (file_mic is None) and (file_upload is None):
	return "ERROR: You have to either use the microphone or upload an audio file"
	elif file_mic is not None:
	file = file_mic
	else:
	file = file_upload

	input_values = process_audio_file(file)

	sequences = model.generate(input_values, forced_bos_token_id=forced_bos_token_id)

	transcription = tokenizer.batch_decode(sequences, skip_special_tokens=True)
	return warn_output + transcription[0]

	target_language = [
	"German (de)",
	"Turkish (tr)",
	"Persian (fa)",
	"Swedish (sv)",
	"Mongolian (mn)",
	"Chinese (zh)",
	"Welsh (cy)",
	"Catalan (ca)",
	"Slovenian (sl)",
	"Estonian (et)",
	"Indonesian (id)",
	"Arabic (ar)",
	"Tamil (ta)",
	"Latvian (lv)",
	"Japanese (ja)",
	]

	MAPPING = {
	"de": 250003,
	"tr": 250023,
	"fa": 250029,
	"sv": 250042,
	"mn": 250037,
	"zh": 250025,
	"cy": 250007,
	"ca": 250005,
	"sl": 250052,
	"et": 250006,
	"id": 250032,
	"ar": 250001,
	"ta": 250044,
	"lv": 250017,
	"ja": 250012,
	}

	iface = gr.Interface(
	fn=transcribe,
	inputs=[
	gr.inputs.Audio(source="microphone", type='filepath', optional=True),
	gr.inputs.Audio(source="upload", type='filepath', optional=True),
	gr.inputs.Dropdown(target_language),
	],
	outputs="text",
	layout="horizontal",
	theme="huggingface",
	article = "<p style='text-align: center'><a href='https://huggingface.co/facebook/wav2vec2-xls-r-300m-en-to-15' target='_blank'>Click to learn more about XLS-R-300M-EN-15 </a> \| <a href='https://arxiv.org/abs/2111.09296' target='_blank'> With 🎙️ from Facebook XLS-R </a></p>",
	title="XLS-R 300M EN-to-15 Speech Translation",
	description="A simple interface to translate English Speech to 15 possible languages.",
	enable_queue=True,
	allow_flagging=False,
	)
	iface.launch()