"""Streamlit demo: transcribe or translate an uploaded audio clip with Whisper."""

import io

import ffmpeg
import streamlit as st
import torch
import torchaudio
import whisper

# Sample rate (Hz) the waveform is resampled to before feature extraction.
SAMPLE_RATE = 16000

# Model sizes offered in the sidebar.
MODEL_SIZES = ["tiny", "base", "small", "medium", "large"]

# MIME types / extensions that torchaudio is asked to decode directly.
_PCM_MIME_TYPES = {
    "audio/wav",
    "audio/x-wav",
    "audio/wave",
    "audio/flac",
    "audio/x-flac",
}
_PCM_EXTENSIONS = (".wav", ".flac")

# MIME types / extensions that are first transcoded to WAV through ffmpeg.
_MP3_MIME_TYPES = {"audio/mpeg", "audio/mp3"}
_MP3_EXTENSIONS = (".mp3",)

# Language code -> display name, used for the sidebar list and the
# "Detected Language" line.
LANGUAGES = {
    "en": "english",
    "zh": "chinese",
    "de": "german",
    "es": "spanish",
    "ru": "russian",
    "ko": "korean",
    "fr": "french",
    "ja": "japanese",
    "pt": "portuguese",
    "tr": "turkish",
    "pl": "polish",
    "ca": "catalan",
    "nl": "dutch",
    "ar": "arabic",
    "sv": "swedish",
    "it": "italian",
    "id": "indonesian",
    "hi": "hindi",
    "fi": "finnish",
    "vi": "vietnamese",
    "iw": "hebrew",
    "uk": "ukrainian",
    "el": "greek",
    "ms": "malay",
    "cs": "czech",
    "ro": "romanian",
    "da": "danish",
    "hu": "hungarian",
    "ta": "tamil",
    "no": "norwegian",
    "th": "thai",
    "ur": "urdu",
    "hr": "croatian",
    "bg": "bulgarian",
    "lt": "lithuanian",
    "la": "latin",
    "mi": "maori",
    "ml": "malayalam",
    "cy": "welsh",
    "sk": "slovak",
    "te": "telugu",
    "fa": "persian",
    "lv": "latvian",
    "bn": "bengali",
    "sr": "serbian",
    "az": "azerbaijani",
    "sl": "slovenian",
    "kn": "kannada",
    "et": "estonian",
    "mk": "macedonian",
    "br": "breton",
    "eu": "basque",
    "is": "icelandic",
    "hy": "armenian",
    "ne": "nepali",
    "mn": "mongolian",
    "bs": "bosnian",
    "kk": "kazakh",
    "sq": "albanian",
    "sw": "swahili",
    "gl": "galician",
    "mr": "marathi",
    "pa": "punjabi",
    "si": "sinhala",
    "km": "khmer",
    "sn": "shona",
    "yo": "yoruba",
    "so": "somali",
    "af": "afrikaans",
    "oc": "occitan",
    "ka": "georgian",
    "be": "belarusian",
    "tg": "tajik",
    "sd": "sindhi",
    "gu": "gujarati",
    "am": "amharic",
    "yi": "yiddish",
    "lo": "lao",
    "uz": "uzbek",
    "fo": "faroese",
    "ht": "haitian creole",
    "ps": "pashto",
    "tk": "turkmen",
    "nn": "nynorsk",
    "mt": "maltese",
    "sa": "sanskrit",
    "lb": "luxembourgish",
    "my": "myanmar",
    "bo": "tibetan",
    "tl": "tagalog",
    "mg": "malagasy",
    "as": "assamese",
    "tt": "tatar",
    "haw": "hawaiian",
    "ln": "lingala",
    "ha": "hausa",
    "ba": "bashkir",
    "jw": "javanese",
    "su": "sundanese",
}


def decode(model, mel, options):
    """Run ``whisper.decode`` on one mel spectrogram and return the text."""
    result = whisper.decode(model, mel, options)
    return result.text


def _to_mono_16k(wave, sample_rate):
    """Downmix a ``(channels, time)`` waveform to mono at ``SAMPLE_RATE``.

    Averaging over the channel axis (instead of ``squeeze(0)``) keeps
    multi-channel uploads from turning into a batched input downstream.
    """
    if wave.dim() > 1:
        wave = wave.mean(dim=0)
    if sample_rate != SAMPLE_RATE:
        wave = torchaudio.transforms.Resample(sample_rate, SAMPLE_RATE)(wave)
    return wave


def _audio_kind(audio):
    """Classify an upload as ``"pcm"``, ``"mp3"`` or ``None`` (unsupported).

    The MIME type is checked first; because browsers report different MIME
    strings for the same container, the file extension is used as fallback.
    """
    mime = (getattr(audio, "type", "") or "").lower()
    if mime in _PCM_MIME_TYPES:
        return "pcm"
    if mime in _MP3_MIME_TYPES:
        return "mp3"

    name = (getattr(audio, "name", "") or "").lower()
    if name.endswith(_PCM_EXTENSIONS):
        return "pcm"
    if name.endswith(_MP3_EXTENSIONS):
        return "mp3"
    return None


def load_audio(audio):
    """Decode an uploaded file into a mono 16 kHz waveform tensor.

    Args:
        audio: The uploaded file object; it must expose ``type`` (MIME
            string) and file-like ``read``/``seek``.

    Returns:
        A 1-D tensor of samples at ``SAMPLE_RATE``, or ``None`` when the
        format is unsupported or decoding failed (an error is shown in the
        Streamlit UI in that case).
    """
    kind = _audio_kind(audio)
    if kind is None:
        st.error("Unsupported audio format")
        return None

    # Another consumer of the upload may have moved the read position.
    audio.seek(0)

    if kind == "pcm":
        wave, sample_rate = torchaudio.load(audio)
        return _to_mono_16k(wave, sample_rate)

    # MP3: pipe the bytes through ffmpeg to get mono 16 kHz PCM WAV.
    try:
        wav_bytes, _ = (
            ffmpeg
            .input("pipe:0")
            .output("pipe:1", format="wav", acodec="pcm_s16le", ac=1, ar="16k")
            .run(capture_stdout=True, capture_stderr=True, input=audio.read())
        )
    except ffmpeg.Error:
        st.error("Failed to decode the MP3 file with ffmpeg")
        return None

    wave, sample_rate = torchaudio.load(io.BytesIO(wav_bytes))
    return _to_mono_16k(wave, sample_rate)


def detect_language(model, mel):
    """Return the language code to which the model assigns the top probability."""
    _, probs = model.detect_language(mel)
    return max(probs, key=probs.get)


def main():
    """Render the Streamlit page and run the selected Whisper task."""
    st.title("Whisper ASR Demo")
    st.markdown(
        """
        This is a demo of OpenAI's Whisper ASR model. The model is trained on 680,000 hours of dataset.
        """
    )

    # --- Sidebar: model selection -------------------------------------
    model_size = st.sidebar.selectbox("Select model", MODEL_SIZES)
    english_only = st.sidebar.checkbox("English only model", value=False)
    if english_only and model_size == "large":
        # "large" is only published as a multilingual checkpoint, so
        # "large.en" cannot be loaded.
        st.sidebar.warning(
            "The large model has no English-only variant; "
            "using the multilingual model."
        )
        english_only = False

    model_name = f"{model_size}.en" if english_only else model_size
    variant = "English only" if english_only else "Multilingual"
    st.sidebar.write(f"Model: {model_name} ({variant})")

    if st.sidebar.checkbox("Show supported languages", value=False):
        st.sidebar.info(list(LANGUAGES.values()))

    # --- Sidebar: decoding options ------------------------------------
    st.sidebar.title("Options")
    beam_size = st.sidebar.slider("Beam Size", min_value=1, max_value=10, value=5)
    fp16 = st.sidebar.checkbox(
        "Enable FP16 for faster transcription (It may affect performance)",
        value=False,
    )
    if english_only:
        task_label = st.sidebar.selectbox("Select task", ["transcribe"], index=0)
    else:
        task_label = st.sidebar.selectbox(
            "Select task", ["transcribe", "translate (To English)"], index=0
        )
    # Pass Whisper its own task name rather than the UI label.
    task = "translate" if task_label.startswith("translate") else "transcribe"

    # --- Main panel ---------------------------------------------------
    st.title("Audio")
    audio_file = st.file_uploader("Upload Audio", type=["wav", "mp3", "flac"])
    if audio_file is None:
        return

    st.audio(audio_file, format=audio_file.type)

    # Decode the upload first so a bad file fails before the model loads.
    audio = load_audio(audio_file)
    if audio is None:
        return  # load_audio already reported the problem in the UI.

    with st.spinner("Loading model..."):
        device = "cuda" if torch.cuda.is_available() else "cpu"
        model = whisper.load_model(model_name, device=device)

    if fp16 and model.device.type != "cuda":
        st.warning("FP16 is not supported on CPU; using FP32 instead.")
        fp16 = False

    with st.spinner("Extracting features..."):
        # pad_or_trim fits the clip to Whisper's fixed input window, so
        # only the beginning of a long recording is decoded.
        audio = whisper.pad_or_trim(audio)
        # Use the mel-bin count the loaded checkpoint was built with.
        mel = whisper.log_mel_spectrogram(
            audio, n_mels=model.dims.n_mels
        ).to(model.device)

    if english_only:
        language = "en"
    else:
        with st.spinner("Detecting language..."):
            language = detect_language(model, mel)
        # The model may report a code that is not in the local table;
        # fall back to the raw code instead of raising KeyError.
        language_name = LANGUAGES.get(language, language)
        st.markdown(f"Detected Language: {language_name} ({language})")

    configuration = {
        "beam_size": beam_size,
        "fp16": fp16,
        "task": task,
        "language": language,
    }
    with st.spinner("Transcribing..."):
        options = whisper.DecodingOptions(**configuration)
        text = decode(model, mel, options)

    st.markdown(f"**Recognized Text:** {text}")


if __name__ == "__main__":
    main()