# Spaces: Running
import numpy as np | |
import streamlit as st | |
from constants import WHISPER_MODELS, language_dict | |
import streamlit as st | |
from utils import translate_to_english, detect_language, write, read, get_key | |
import whisperx as whisper | |
import json | |
import pandas as pd | |
from pydub import AudioSegment | |
import os | |
# Seed the two button flags with False the first time the script runs in a
# session; existing values are left untouched on reruns.
for _flag in ("btn1", "btn2"):
    if _flag not in st.session_state:
        st.session_state[_flag] = False
class ByteEncoder(json.JSONEncoder):
    """JSON encoder that serialises ``bytes`` values as hexadecimal strings."""

    def default(self, obj):
        """Return a JSON-serialisable stand-in for *obj*.

        ``bytes`` objects become their hex representation; every other type is
        deferred to the base encoder, which raises ``TypeError`` for values it
        cannot serialise.
        """
        if isinstance(obj, bytes):
            return obj.hex()
        return super().default(obj)
def disable_btn2():
    """Set the ``btn2`` flag in the Streamlit session state to True."""
    flags = st.session_state
    flags["btn2"] = True
def disable_btn1():
    """Set the ``btn1`` flag in the Streamlit session state to True."""
    flags = st.session_state
    flags["btn1"] = True
st.set_page_config(page_title="Whisper-X", layout="wide")

import torch

# Device string handed to the alignment model further down. torch only
# recognises "cuda" (not "gpu") as the device type for NVIDIA GPUs.
device = "cuda" if torch.cuda.is_available() else "cpu"
def _temperature_schedule(start, increment):
    """Return the tuple of decoding temperatures to try, in ascending order.

    The schedule runs from *start* up to (and including) 1.0 in steps of
    *increment*. A missing, zero or negative increment means "no fallback" and
    yields just ``(start,)``; ``np.arange`` would otherwise raise on a zero
    step. The result is empty when *start* is above 1.0.
    """
    if increment is None or increment <= 0:
        return (start,)
    return tuple(np.arange(start, 1.0 + 1e-6, increment))


# NOTE: `input` shadows the builtin; the name is kept so the module-level
# names of this script stay unchanged.
input, output = st.columns(2, gap="medium")
with input:
    st.header("Input")

    # Bundled sample clip. The handle is deliberately left open: the output
    # column falls back to it when the user uploads nothing.
    audio_file = open("audio.wav", "rb")
    audio_bytes = audio_file.read()
    st.audio(audio_bytes, format="audio/wav")

    audio_uploaded = st.file_uploader(
        label="Upload your file",
        type=["mp3", "wav"],
        help="Your input file",
    )

    # The aligned-JSON uploader is currently disabled; downstream code checks
    # `text_json` against None to decide which path to take.
    text_json = None

    model_name = st.selectbox(
        label="Choose your model",
        options=WHISPER_MODELS,
        help="Choose a Whisper model.",
    )
    # An empty selection falls back to the "base" model.
    model_name = "base" if model_name == "" else model_name

    transcription = st.selectbox(
        "transcription",
        options=["plain text", "srt", "vtt", "ass", "tsv"],
        help="Choose the format for the transcription",
    )
    translate = st.checkbox(
        "translate", help="Translate the text to English when set to True"
    )
    language = st.selectbox(
        label="language",
        options=list(language_dict.keys()) + list(language_dict.values()),
        help="language spoken in the audio",
    )
    patience = st.number_input(
        label="patience",
        step=0.01,
        value=1.0,
        help="optional patience value to use in beam decoding, as in https://arxiv.org/abs/2204.05424, the default (1.0) is equivalent to conventional beam search",
    )
    temperature = st.number_input(
        label="temperature",
        step=0.01,
        value=1.0,
        help="temperature to use for sampling",
    )
    suppress_tokens = st.text_input(
        "suppress_tokens",
        value="-1",
        help="comma-separated list of token ids to suppress during sampling; '-1' will suppress most special characters except common punctuations",
    )
    initial_prompt = st.text_area(
        label="initial_prompt",
        help="optional text to provide as a prompt for the first window.",
    )
    condition_on_previous_text = st.checkbox(
        "condition_on_previous_text",
        help="if True, provide the previous output of the model as a prompt for the next window; disabling may make the text inconsistent across windows, but the model becomes less prone to getting stuck in a failure loop",
    )
    temperature_increment_on_fallback = st.number_input(
        label="temperature_increment_on_fallback",
        step=0.01,
        value=0.2,
        help="temperature to increase when falling back when the decoding fails to meet either of the thresholds below",
    )
    compression_ratio_threshold = st.number_input(
        label="compression_ratio_threshold",
        value=2.4,
        step=0.01,
        help="if the gzip compression ratio is higher than this value, treat the decoding as failed",
    )
    logprob_threshold = st.number_input(
        label="logprob_threshold",
        value=-1.0,
        step=0.01,
        help="if the average log probability is lower than this value, treat the decoding as failed",
    )
    no_speech_threshold = st.number_input(
        label="no_speech_threshold",
        value=0.6,
        step=0.01,
        help="if the probability of the <|nospeech|> token is higher than this value AND the decoding has failed due to `logprob_threshold`, consider the segment as silence",
    )

    # Expand the single temperature into the fallback schedule passed to
    # transcribe(). An empty schedule (start above 1.0) cannot be decoded
    # with, so report it and keep the submit button disabled.
    temperature = _temperature_schedule(
        temperature, temperature_increment_on_fallback
    )
    if not temperature:
        st.error("Choose correct value for temperature")

    submit = st.button("Submit", type="primary", disabled=not temperature)
with output:
    st.header("Output")
    import uuid

    # Unique stem for the temporary WAV file written while handling a submit.
    name = str(uuid.uuid1())
    if submit:
        wav_path = f"{name}.wav"
        if audio_uploaded is None:
            # Nothing uploaded: fall back to the bundled sample clip.
            audio_uploaded = audio_file
        try:
            # Normalise the input to a real WAV file on disk. The decoder is
            # picked from the (case-insensitive) extension, and the export
            # format is explicit because AudioSegment.export() defaults to MP3.
            source_name = audio_uploaded.name.lower()
            if source_name.endswith(".wav"):
                temp = AudioSegment.from_wav(audio_uploaded)
            elif source_name.endswith(".mp3"):
                temp = AudioSegment.from_mp3(audio_uploaded)
            else:
                st.error("Unsupported audio format; please upload an mp3 or wav file")
                st.stop()
            temp.export(wav_path, format="wav")

            # An empty language selection triggers automatic detection. The
            # model loaded for detection is reused for transcription below
            # instead of being loaded a second time.
            model = None
            if language == "":
                model = whisper.load_model(model_name)
                with st.spinner("Detecting language..."):
                    detection = detect_language(wav_path, model)
                    language = detection.get("detected_language")
            # Anything longer than a two-letter code is mapped through
            # get_key(); presumably this turns a language name into its code
            # -- TODO confirm against utils.get_key.
            if len(language) > 2:
                language = get_key(language)

            # Placeholders are created up front so the on-screen order is
            # fixed regardless of the order in which they are filled.
            segments_pre = st.empty()
            segments_post = st.empty()
            segments_post_json = st.empty()
            segments_post2 = st.empty()
            trans = st.empty()
            lang = st.empty()

            if text_json is None:
                with st.spinner("Running ... "):
                    decode = {"suppress_tokens": suppress_tokens, "beam_size": 5}
                    if model is None:
                        model = whisper.load_model(model_name)
                    with st.container():
                        with st.spinner(f"Running with {model_name} model"):
                            result = model.transcribe(
                                wav_path,
                                language=language,
                                patience=patience,
                                initial_prompt=initial_prompt,
                                condition_on_previous_text=condition_on_previous_text,
                                temperature=temperature,
                                compression_ratio_threshold=compression_ratio_threshold,
                                logprob_threshold=logprob_threshold,
                                no_speech_threshold=no_speech_threshold,
                                **decode,
                            )
                    if translate:
                        result = translate_to_english(result, json=False)
                    with open("transcription.json", "w") as f:
                        json.dump(result["segments"], f, indent=4, cls=ByteEncoder)
                    with st.spinner("Running alignment model ..."):
                        model_a, metadata = whisper.load_align_model(
                            language_code=result["language"], device=device
                        )
                        result_aligned = whisper.align(
                            result["segments"],
                            model_a,
                            metadata,
                            wav_path,
                            device=device,
                        )

            if text_json is not None:
                # NOTE(review): unreachable while the aligned-JSON uploader is
                # disabled (text_json is always None). Kept as-is; it aligns
                # against audio_uploaded.name rather than the temporary WAV --
                # confirm before re-enabling.
                if translate:
                    result = translate_to_english(text_json, json=True)
                with st.spinner("Running alignment model ..."):
                    model_a, metadata = whisper.load_align_model(
                        language_code=language, device=device
                    )
                    result_aligned = whisper.align(
                        text_json, model_a, metadata, audio_uploaded.name, device
                    )

            if text_json is None:
                write(
                    wav_path,
                    dtype=transcription,
                    result_aligned=result_aligned,
                )
                trans_text = read(wav_path, transcription)
                trans.text_area(
                    "transcription", trans_text, height=None, max_chars=None, key=None
                )
                segments_pre.text_area(
                    "Segments before alignment",
                    result["segments"],
                    height=None,
                    max_chars=None,
                    key=None,
                )
                segments_post.text_area(
                    "Word Segments after alignment",
                    result_aligned["word_segments"],
                    height=None,
                    max_chars=None,
                    key=None,
                )
                with open("segments.json", "w", encoding="utf-8") as f:
                    # indent=False is treated like indent=0 by the json
                    # module: one item per line, no leading spaces.
                    json.dump(result_aligned["word_segments"], f, indent=False)
                segments_post2.text_area(
                    "Segments after alignment",
                    result_aligned["segments"],
                    height=None,
                    max_chars=None,
                    key=None,
                )

            lang.text_input(
                "detected language", language_dict.get(language), disabled=True
            )
        finally:
            # Always clean up the temporary WAV, including when a step above
            # failed or the file was never written.
            if os.path.exists(wav_path):
                os.remove(wav_path)