import os
import gradio as gr
import whisper
from whisper import tokenizer
import time

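# Module-level cache of the loaded Whisper model so it persists across
# streaming callbacks and is only reloaded when the user picks a new size.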
current_size = 'base'
model = whisper.load_model(current_size)
AUTO_DETECT_LANG = "Auto Detect"

def transcribe(audio, state, model_size='base', delay=1.2, lang=None, translate=False):
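    """Transcribe a streamed audio chunk, accumulating results in `state`.

    Returns the running transcription, the running translation, the updated
    state dict, and a label with the language Whisper detected.
    """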
    # Throttle the streaming loop; the slider guarantees delay >= 1, so the
    # sleep is never negative.
    time.sleep(delay - 1)

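    # Reload the model only if the requested size changed.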
    global current_size
    global model
    if model_size != current_size:
        current_size = model_size
        model = whisper.load_model(current_size)
  
    transcription = model.transcribe(
        audio,
        # language=None lets Whisper auto-detect the spoken language.
        language=lang if lang != AUTO_DETECT_LANG else None
    )
    state['transcription'] += transcription['text'] + " "

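    # Optional second pass for English translation: pad/trim the audio to
    # 30 s, compute a log-Mel spectrogram, and decode with task="translate".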
    if translate:
        x = whisper.load_audio(audio)
        x = whisper.pad_or_trim(x)
        mel = whisper.log_mel_spectrogram(x).to(model.device)

        options = whisper.DecodingOptions(task="translate")
        translation = whisper.decode(model, mel, options)

        state['translation'] += translation.text + " "

    return state['transcription'], state['translation'], state, f"Detected language: {transcription['language']}"

title = "OpenAI's Whisper Real-time Demo"
description = "A simple demo of OpenAI's [**Whisper**](https://github.com/openai/whisper) speech recognition model. The demo runs on a CPU; for faster inference, choose the 'tiny' model size and set the language explicitly."

model_size = gr.Dropdown(label="Model size", choices=['tiny', 'base', 'small', 'medium', 'large'], value='base')

delay_slider = gr.Slider(minimum=1, maximum=5, value=1.2, label="Rate of transcription (seconds between updates)")

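# Build the language choices from Whisper's tokenizer (name -> code map),
# capitalized for display, with the auto-detect option first.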
available_languages = sorted(tokenizer.TO_LANGUAGE_CODE.keys())
available_languages = [lang.capitalize() for lang in available_languages]
available_languages = [AUTO_DETECT_LANG]+available_languages

lang_dropdown = gr.Dropdown(choices=available_languages, value=AUTO_DETECT_LANG, label="Language")

# Auto-detect needs no special handling here: transcribe() maps
# AUTO_DETECT_LANG to language=None before calling Whisper.

translate_checkbox = gr.Checkbox(label="Translate to English", value=False)

transcription_tb = gr.Textbox(label="Transcription", lines=10, max_lines=20)
translation_tb = gr.Textbox(label="Translation", lines=10, max_lines=20)
detected_lang = gr.HTML(label="Detected Language")

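# Per-session state holding the accumulated transcription and translation.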
state = gr.State({"transcription": "", "translation": ""})

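# live=True re-runs transcribe() continuously as the microphone streams audio.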
gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(source="microphone", type="filepath", streaming=True),
        state,
        model_size,
        delay_slider,
        lang_dropdown,
        translate_checkbox
        ], 
    outputs=[
        transcription_tb,
        translation_tb,
        state,
        detected_lang
    ],
    live=True,
    allow_flagging='never',
    title=title,
    description=description,
).launch(
    # enable_queue=True,
    # debug=True
)
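
# Usage (assuming this file is saved as app.py): run `python app.py` and open
# the printed local URL; Gradio will stream microphone audio into transcribe().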