import numpy as np
import unicodedata
import diff_match_patch as dmp_module
from enum import Enum
import gradio as gr
from datasets import load_dataset
import pandas as pd
from jiwer import process_words, wer_default
class Action(Enum):
    """Operation code attached to each segment of a text diff.

    ``style_text`` compares the first element of every diff entry against
    these values.  Presumably they mirror the op codes emitted by
    ``diff_match_patch`` -- confirm against that library.
    """

    # Segment appears only in the second text.
    INSERTION = 1
    # Segment appears only in the first text.
    DELETION = -1
    # Segment is common to both texts.
    EQUAL = 0
def compare_string(text1: str, text2: str) -> list:
    """Diff two strings after Unicode NFKC normalisation.

    Args:
        text1: The first (source) text.
        text2: The second (destination) text.

    Returns:
        The diff produced by ``diff_match_patch.diff_main`` on the normalised
        texts, after ``diff_cleanupSemantic`` has been applied to it.
    """
    # Normalise both inputs the same way before comparing them.
    before, after = (
        unicodedata.normalize("NFKC", raw) for raw in (text1, text2)
    )

    engine = dmp_module.diff_match_patch()
    edits = engine.diff_main(before, after)
    # The clean-up result is not reassigned: the list is modified in place.
    engine.diff_cleanupSemantic(edits)
    return edits
def style_text(diff):
    """Concatenate the segments of a diff into one Markdown-escaped string.

    Args:
        diff: Iterable of ``(action, text)`` pairs, where ``action`` is one
            of the ``Action`` member values.

    Returns:
        All segment texts joined in order, with every "](" and "~" escaped
        by a backslash.

    Raises:
        NotImplementedError: If a segment carries an action that is not an
            ``Action`` value.
    """
    # NOTE(review): every branch currently emits the text unchanged, so
    # insertions and deletions are not visually distinguished -- confirm
    # whether per-action markup was intended here.
    pieces = []
    for action, text in diff:
        if action == Action.INSERTION.value:
            pieces.append(str(text))
        elif action == Action.DELETION.value:
            pieces.append(str(text))
        elif action == Action.EQUAL.value:
            pieces.append(str(text))
        else:
            raise NotImplementedError("Not Implemented")

    # Join once instead of growing a string with += inside the loop.
    full_text = "".join(pieces)

    # Raw strings keep the literal backslashes without relying on invalid
    # escape sequences.  The escapes presumably stop the Markdown renderer
    # from treating "](" as a link and "~" as strikethrough.
    return full_text.replace("](", r"]\(").replace("~", r"\~")
# Validation split of the long-form TED-LIUM dataset; each sample is read
# for its "audio" entry when a visualisation is requested.
dataset = load_dataset("distil-whisper/tedlium-long-form", split="validation")

# Normalised target and predicted transcriptions.  Rows are looked up with
# the same index as the dataset samples, so they are presumably stored in
# dataset order -- confirm against how the CSV was produced.
csv = pd.read_csv("assets/large-v2.csv")
norm_target = list(csv["Norm Target"])
norm_pred = list(csv["Norm Pred"])

# Integer PCM type used for playback, and the scale factor that maps
# floating-point samples onto its positive range.
target_dtype = np.int16
max_range = np.iinfo(target_dtype).max
def get_visualisation(idx):
    """Build the audio, error metrics and text diff for one dataset sample.

    Args:
        idx: 1-based sample number, as selected on the UI slider.

    Returns:
        A 5-tuple ``((sampling_rate, samples), wer_percentage,
        rel_insertions, rel_length, full_text)`` where ``samples`` is the
        audio converted to ``target_dtype``, ``wer_percentage`` is the word
        error rate times 100, ``rel_insertions`` is the insertion count
        divided by the target word count, ``rel_length`` is the prediction
        word count divided by the target word count, and ``full_text`` is
        the escaped diff text.
    """
    # The slider is 1-based while the dataset and transcript lists are
    # 0-based.  int() guards against the value arriving as a float.
    idx = int(idx) - 1

    audio = dataset[idx]["audio"]
    # Clip before scaling so samples outside [-1, 1] saturate instead of
    # wrapping around in the integer cast.  Cast to target_dtype (rather
    # than a hard-coded type) so it stays in step with max_range.
    array = (np.clip(audio["array"], -1.0, 1.0) * max_range).astype(target_dtype)
    sampling_rate = audio["sampling_rate"]

    text1 = norm_target[idx]
    text2 = norm_pred[idx]

    wer_output = process_words(text1, text2, wer_default, wer_default)
    wer_percentage = 100 * wer_output.wer

    # Both relative metrics share the target word count as denominator.
    target_word_count = len(text1.split())
    rel_insertions = wer_output.insertions / target_word_count
    rel_length = len(text2.split()) / target_word_count

    diff = compare_string(text1, text2)
    full_text = style_text(diff)

    return (sampling_rate, array), wer_percentage, rel_insertions, rel_length, full_text
if __name__ == "__main__":
    # Build the UI: a sample selector, a trigger button, and outputs for the
    # audio, the three metrics and the text diff.
    with gr.Blocks() as demo:
        # The intro text is created inside the Blocks context so that it is
        # part of the demo layout.
        gr.Markdown("Analyse the transcriptions generated by the Whisper large-v2 model on the TEDLIUM dev set.")

        # 1-based sample selector; get_visualisation converts it to an index.
        slider = gr.Slider(
            minimum=1, maximum=len(norm_target), step=1, label="Dataset sample"
        )
        btn = gr.Button("Analyse")
        audio_out = gr.Audio(label="Audio input")

        with gr.Row():
            wer = gr.Number(label="WER")
            relative_insertions = gr.Number(label="Relative insertions (# insertions / target length)")
            # The value shown is prediction word count over target word count.
            relative_length = gr.Number(label="Relative length (prediction length / target length)")

        text_out = gr.Markdown(label="Text difference")

        # Output order matches the tuple returned by get_visualisation.
        btn.click(
            fn=get_visualisation,
            inputs=slider,
            outputs=[audio_out, wer, relative_insertions, relative_length, text_out],
        )

    demo.launch()