import io

import ffmpeg
import streamlit as st
import torch
import torchaudio
import whisper
LANGUAGES = {
    "en": "english",
    "zh": "chinese",
    "de": "german",
    "es": "spanish",
    "ru": "russian",
    "ko": "korean",
    "fr": "french",
    "ja": "japanese",
    "pt": "portuguese",
    "tr": "turkish",
    "pl": "polish",
    "ca": "catalan",
    "nl": "dutch",
    "ar": "arabic",
    "sv": "swedish",
    "it": "italian",
    "id": "indonesian",
    "hi": "hindi",
    "fi": "finnish",
    "vi": "vietnamese",
    "iw": "hebrew",
    "uk": "ukrainian",
    "el": "greek",
    "ms": "malay",
    "cs": "czech",
    "ro": "romanian",
    "da": "danish",
    "hu": "hungarian",
    "ta": "tamil",
    "no": "norwegian",
    "th": "thai",
    "ur": "urdu",
    "hr": "croatian",
    "bg": "bulgarian",
    "lt": "lithuanian",
    "la": "latin",
    "mi": "maori",
    "ml": "malayalam",
    "cy": "welsh",
    "sk": "slovak",
    "te": "telugu",
    "fa": "persian",
    "lv": "latvian",
    "bn": "bengali",
    "sr": "serbian",
    "az": "azerbaijani",
    "sl": "slovenian",
    "kn": "kannada",
    "et": "estonian",
    "mk": "macedonian",
    "br": "breton",
    "eu": "basque",
    "is": "icelandic",
    "hy": "armenian",
    "ne": "nepali",
    "mn": "mongolian",
    "bs": "bosnian",
    "kk": "kazakh",
    "sq": "albanian",
    "sw": "swahili",
    "gl": "galician",
    "mr": "marathi",
    "pa": "punjabi",
    "si": "sinhala",
    "km": "khmer",
    "sn": "shona",
    "yo": "yoruba",
    "so": "somali",
    "af": "afrikaans",
    "oc": "occitan",
    "ka": "georgian",
    "be": "belarusian",
    "tg": "tajik",
    "sd": "sindhi",
    "gu": "gujarati",
    "am": "amharic",
    "yi": "yiddish",
    "lo": "lao",
    "uz": "uzbek",
    "fo": "faroese",
    "ht": "haitian creole",
    "ps": "pashto",
    "tk": "turkmen",
    "nn": "nynorsk",
    "mt": "maltese",
    "sa": "sanskrit",
    "lb": "luxembourgish",
    "my": "myanmar",
    "bo": "tibetan",
    "tl": "tagalog",
    "mg": "malagasy",
    "as": "assamese",
    "tt": "tatar",
    "haw": "hawaiian",
    "ln": "lingala",
    "ha": "hausa",
    "ba": "bashkir",
    "jw": "javanese",
    "su": "sundanese",
}

def decode(model, mel, options):
    # Decode a log-Mel spectrogram with the given options and return the recognized text.
    result = whisper.decode(model, mel, options)
    return result.text

def load_audio(audio):
    # Convert an uploaded file into a mono 16 kHz waveform tensor, as Whisper expects.
    if audio.type in ("audio/wav", "audio/flac"):
        wave, sr = torchaudio.load(audio)
    elif audio.type == "audio/mpeg":
        # Decode MP3 to 16-bit PCM WAV in memory with ffmpeg, then load it with torchaudio.
        data, _ = (
            ffmpeg
            .input("pipe:0")
            .output("pipe:1", format="wav", acodec="pcm_s16le", ac=1, ar="16k")
            .run(capture_stdout=True, input=audio.read())
        )
        wave, sr = torchaudio.load(io.BytesIO(data))
    else:
        st.error("Unsupported audio format")
        st.stop()
    if sr != 16000:
        wave = torchaudio.transforms.Resample(sr, 16000)(wave)
    # Down-mix multi-channel audio to a single channel.
    return wave.mean(dim=0)

def detect_language(model, mel):
    # Return the language code with the highest detection probability.
    _, probs = model.detect_language(mel)
    return max(probs, key=probs.get)

def main():
    st.title("Whisper ASR Demo")
    st.markdown(
        """
        This is a demo of OpenAI's Whisper ASR model, trained on 680,000 hours of multilingual and multitask supervised audio data.
        """
    )

    # Model selection: appending ".en" selects the English-only checkpoints.
    model_selection = st.sidebar.selectbox("Select model", ["tiny", "base", "small", "medium", "large"])
    en_model_selection = st.sidebar.checkbox("English-only model", value=False)
    if en_model_selection:
        model_selection += ".en"
    st.sidebar.write(f"Model: {model_selection} {'(English only)' if en_model_selection else '(Multilingual)'}")
    if st.sidebar.checkbox("Show supported languages", value=False):
        st.sidebar.info(list(LANGUAGES.values()))

    st.sidebar.title("Options")
    beam_size = st.sidebar.slider("Beam Size", min_value=1, max_value=10, value=5)
    fp16 = st.sidebar.checkbox("Enable FP16 for faster transcription (may affect accuracy)", value=False)
    if not en_model_selection:
        task_label = st.sidebar.selectbox("Select task", ["transcribe", "translate (to English)"], index=0)
    else:
        task_label = st.sidebar.selectbox("Select task", ["transcribe"], index=0)
    # Map the UI label to the task name expected by whisper.DecodingOptions.
    task = "translate" if task_label.startswith("translate") else "transcribe"

    st.title("Audio")
    audio_file = st.file_uploader("Upload Audio", type=["wav", "mp3", "flac"])
    if audio_file is not None:
        st.audio(audio_file, format=audio_file.type)
        with st.spinner("Loading model..."):
            model = whisper.load_model(model_selection)
            model = model.to("cuda" if torch.cuda.is_available() else "cpu")
        audio = load_audio(audio_file)
        with st.spinner("Extracting features..."):
            # Whisper operates on 30-second, 16 kHz windows: pad or trim the waveform,
            # then compute the log-Mel spectrogram it decodes from.
            audio = whisper.pad_or_trim(audio)
            mel = whisper.log_mel_spectrogram(audio).to(model.device)
        if not en_model_selection:
            with st.spinner("Detecting language..."):
                language = detect_language(model, mel)
                st.markdown(f"Detected Language: {LANGUAGES[language]} ({language})")
        else:
            language = "en"
        configuration = {"beam_size": beam_size, "fp16": fp16, "task": task, "language": language}
        with st.spinner("Transcribing..."):
            options = whisper.DecodingOptions(**configuration)
            text = decode(model, mel, options)
        st.markdown(f"**Recognized Text:** {text}")


if __name__ == "__main__":
    main()