"""Gradio demo for inspecting Whisper large-v2 transcription errors on TEDLIUM.

For a chosen validation sample the app plays the audio, reports word error
rate statistics and shows a character-level diff between the normalised
target and the normalised prediction.
"""
import unicodedata
from enum import Enum

import diff_match_patch as dmp_module
import gradio as gr
import numpy as np
import pandas as pd
from datasets import load_dataset
from jiwer import process_words, wer_default


class Action(Enum):
    """Operation codes compared against the first element of each diff tuple."""

    INSERTION = 1
    DELETION = -1
    EQUAL = 0


def compare_string(text1: str, text2: str) -> list:
    """Return the diff between two strings after NFKC normalisation.

    Args:
        text1: The source text of the diff.
        text2: The text that ``text1`` is compared against.

    Returns:
        The list produced by ``diff_match_patch.diff_main`` after
        ``diff_cleanupSemantic`` has been applied to it in place.
    """
    # Normalise both sides so equivalent Unicode forms do not show up as edits.
    normalized_source = unicodedata.normalize("NFKC", text1)
    normalized_other = unicodedata.normalize("NFKC", text2)

    dmp = dmp_module.diff_match_patch()
    diff = dmp.diff_main(normalized_source, normalized_other)
    dmp.diff_cleanupSemantic(diff)  # mutates ``diff`` in place
    return diff


def style_text(diff) -> str:
    """Join the text of every diff entry into one Markdown-safe string.

    Args:
        diff: Iterable of ``(action, text)`` pairs where ``action`` is one of
            the ``Action`` values.

    Returns:
        The concatenated text with ``](`` and ``~`` backslash-escaped.

    Raises:
        NotImplementedError: If an entry carries an unknown action code.
    """
    # NOTE(review): every action currently emits its text unchanged, so
    # insertions and deletions are not visually distinguished - confirm whether
    # per-action markup is meant to be applied in these branches.
    pieces = []
    for action, text in diff:
        if action == Action.INSERTION.value:
            pieces.append(f"{text}")
        elif action == Action.DELETION.value:
            pieces.append(f"{text}")
        elif action == Action.EQUAL.value:
            pieces.append(f"{text}")
        else:
            raise NotImplementedError("Not Implemented")

    # Escape sequences that Markdown would otherwise read as a link target or
    # strikethrough. The backslashes are written explicitly: "\(" and "\~" are
    # invalid string escapes and trigger a SyntaxWarning on recent Pythons.
    return "".join(pieces).replace("](", "]\\(").replace("~", "\\~")


dataset = load_dataset("distil-whisper/tedlium-long-form", split="validation")

csv = pd.read_csv("assets/large-v2.csv")

# Plain Python lists of the normalised target and predicted transcriptions.
norm_target = csv["Norm Target"].tolist()
norm_pred = csv["Norm Pred"].tolist()

# Audio samples are rescaled to this integer type before being returned.
target_dtype = np.int16
max_range = np.iinfo(target_dtype).max


def get_visualisation(idx):
    """Build the audio, metrics and diff text for one dataset sample.

    Args:
        idx: 1-based sample number (the slider starts at 1).

    Returns:
        A tuple ``((sampling_rate, samples), wer_percentage, rel_insertions,
        rel_length, full_text)`` where ``samples`` is the audio as
        ``target_dtype``, ``rel_insertions`` is insertions divided by the
        target word count, ``rel_length`` is the prediction word count divided
        by the target word count, and ``full_text`` is the escaped diff text.
    """
    # The slider is 1-based and may deliver a float; the data is 0-indexed.
    idx = int(idx) - 1

    audio = dataset[idx]["audio"]
    # Assumes float samples nominally in [-1, 1] (implied by the scaling below);
    # clipping keeps out-of-range values from wrapping around in the int cast.
    samples = (np.clip(audio["array"], -1.0, 1.0) * max_range).astype(target_dtype)
    sampling_rate = audio["sampling_rate"]

    text1 = norm_target[idx]
    text2 = norm_pred[idx]

    wer_output = process_words(text1, text2, wer_default, wer_default)
    wer_percentage = 100 * wer_output.wer

    target_word_count = len(text1.split())
    rel_insertions = wer_output.insertions / target_word_count
    rel_length = len(text2.split()) / target_word_count

    full_text = style_text(compare_string(text1, text2))

    return (sampling_rate, samples), wer_percentage, rel_insertions, rel_length, full_text


if __name__ == "__main__":
    with gr.Blocks() as demo:
        # Created inside the Blocks context so it is part of the rendered app.
        gr.Markdown(
            "Analyse the transcriptions generated by the Whisper large-v2 model on the TEDLIUM dev set."
        )
        slider = gr.Slider(
            minimum=1, maximum=len(norm_target), step=1, label="Dataset sample"
        )
        btn = gr.Button("Analyse")
        audio_out = gr.Audio(label="Audio input")
        with gr.Row():
            wer = gr.Number(label="WER")
            relative_insertions = gr.Number(
                label="Relative insertions (# insertions / target length)"
            )
            relative_length = gr.Number(
                label="Relative length (prediction length / target length)"
            )
        text_out = gr.Markdown(label="Text difference")

        btn.click(
            fn=get_visualisation,
            inputs=slider,
            outputs=[audio_out, wer, relative_insertions, relative_length, text_out],
        )

    demo.launch()