patrickvonplaten committed on
Commit
950d3b0
1 Parent(s): 7e4953f
Files changed (1) hide show
  1. run_long_trans.py +70 -0
run_long_trans.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
# Benchmark script: long-form Whisper transcription of a podcast episode,
# comparing several checkpoints by word error rate (WER).

from datasets import load_dataset  # NOTE(review): unused below — kept to avoid changing file-level imports
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
import torch
import librosa
import jiwer

# Sample rate (Hz) the audio is resampled to before feature extraction.
SAMPLING_RATE = 16_000
# Local path of the episode audio to transcribe.
PATH = "../AAAS9491189740.mp3"
# Hand-written reference transcript used as ground truth for WER.
REF = """This is the Science Podcast for January 12th, 2024. I'm Sarah Crespi. First up, we have journalist Rich Stone. He's back with news from his latest trip to Ukraine. This week he shares stories about environmental damage from the war, particularly the grave consequences of a dam explosion. Next on the show, producer Kevin McLean talks with researcher Nardi Gomez-Lopez about signaling between fetus and mother during childbirth, and how understanding this crosstalk might one day help predict premature labor. Finally, in a sponsored segment from our Custom Publishing office, Director of Custom Publishing, Erika Berg, talks with researcher Andrew Pospisilic about how epigenetics stabilizes particular gene expression patterns and how these patterns impact our risk for disease. We've talked about the war in Ukraine a number of times on the show with a science focus. We talked about how the war has affected science in Ukraine. Can scientists keep doing their jobs? How have facilities fared? We also had a segment on how Russian scientists are leaving that country because of disagreements with government policy and the prosecution of the war, as well as about, you know, the state of science and scientists who have remained in Russia. This week, we're going to hear about the environmental toll this war has had on Ukraine. Contributing correspondent Rich Stone visited in the fall. Hi, Rich. Welcome back to the Science Podcast. Hi, Sarah. Glad to be back. The focus, as I mentioned in the intro, is about the environment impact of the war and the scientists. But we should really just mention that this war has had an immense impact on the lives of people in Ukraine, the infrastructure. You know, there's been many deaths So we can't just ignore that and only talk about science. For sure. Hundreds of thousands of people have died during the war. 
It's been a tragedy for Ukraine, And certainly the ecological consequences that we're going to talk about also affect people's daily lives. Yeah, for sure. So we're going to mainly talk about the loss of this dam in Ukraine that was holding back a lot of water and formed a reservoir. It was destroyed by a pair of explosions over the summer. Can you set the scene for us here? The Dnipro River flows through the heart of Ukraine, coming into the country from the north through Belarus, into Ukrainian territory near Chernobyl, flowing through Kiev, and then down the middle of the country into the Black Sea. And it's one of the major rivers of Europe, not just Ukraine. In Soviet times, there were six dams, hydroelectric dams built on the Dnieper River. Kokoka Dam was the final dam and the one furthest downstream. And that's the one that was breached on June 6 of this year. Two explosions damaged the dam."""
# Passed to generate() as ``prompt_condition_type`` — presumably applies the
# prompt to every decoded segment rather than only the first; confirm against
# the transformers Whisper generation docs.
COND_TYPE = "all-segments"
+
13
def load_model(model_id):
    """Return the ``(processor, model)`` pair for *model_id*, fp16 on CUDA.

    The model weights are loaded in half precision with low CPU-memory
    staging and then moved onto the GPU.
    """
    proc = AutoProcessor.from_pretrained(model_id)
    seq2seq = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id,
        low_cpu_mem_usage=True,
        torch_dtype=torch.float16,
    )
    seq2seq.to("cuda")
    return proc, seq2seq
18
+
19
+
20
def load_audio(path, length_in_s=180):
    """Load *path* resampled to ``SAMPLING_RATE`` and keep its first seconds.

    Parameters
    ----------
    path : str
        Audio file to load (mp3/wav/anything librosa handles).
    length_in_s : int, optional
        Duration to keep from the start of the file, in seconds
        (default 180, i.e. three minutes).

    Returns
    -------
    numpy.ndarray
        1-D mono waveform at ``SAMPLING_RATE`` Hz.
    """
    # Bug fix: the original ignored ``path`` and always read the module-level
    # PATH constant, so passing a different file had no effect.
    array, _ = librosa.load(path, sr=SAMPLING_RATE)

    # Keep only the first ``length_in_s`` seconds. (The original comment
    # claimed the first 45 s of commercial were cut — no such cut happens.)
    array = array[: length_in_s * SAMPLING_RATE]

    return array
28
+
29
def prepare_inputs(array, processor):
    """Build the kwargs dict for ``model.generate`` from a raw waveform.

    Parameters
    ----------
    array : numpy.ndarray
        Mono waveform at ``SAMPLING_RATE`` Hz.
    processor : transformers processor
        Whisper processor providing feature extraction and prompt encoding.

    Returns
    -------
    dict
        Processor features (fp16, on CUDA) merged with the long-form
        generation settings, ready to be splatted into ``model.generate``.
    """
    # Proper nouns from the episode, used as a prompt to bias decoding toward
    # the correct spellings. Encoding fix: the original contained mojibake
    # ("G贸mez-L贸pez" — UTF-8 "ó" mis-decoded through a legacy CJK codec);
    # restored to "Gómez-López".
    conditioning = "Sarah Crespi, Kevin McLean, Nardhy Gómez-López, Erika Berg, Andrew Pospisilik, Kakhovka."

    # No truncation + longest padding: keep the full clip for long-form
    # (sequential chunked) transcription instead of the 30 s default.
    inputs = processor(array, sampling_rate=SAMPLING_RATE, padding="longest", truncation=False, return_tensors="pt")
    inputs = inputs.to("cuda", torch.float16)

    prompt_ids = processor.get_prompt_ids(conditioning, return_tensors="pt").to("cuda") if conditioning is not None else None

    # Long-form decoding heuristics: temperature fallback ladder plus
    # compression-ratio / log-prob thresholds, conditioning each segment on
    # the previously generated tokens and on the prompt (COND_TYPE).
    gen_kwargs = dict(
        condition_on_prev_tokens=True,
        temperature=(0.0, 0.2, 0.4, 0.6, 0.8),
        compression_ratio_threshold=1.35,
        logprob_threshold=-1.0,
        prompt_ids=prompt_ids,
        prompt_condition_type=COND_TYPE,
    )
    inputs = {**inputs, **gen_kwargs}

    return inputs
48
+
49
# ---- driver: transcribe the same clip with several Whisper checkpoints ----
array = load_audio(PATH)

WERS = {}

# A full sweep would also include "tiny.en" and "base.en".
for model_id in ["small.en", "medium.en", "large-v2", "large-v3"]:
    processor, model = load_model("openai/whisper-" + model_id)

    inputs = prepare_inputs(array, processor)

    tokens = model.generate(**inputs)
    transcript = processor.batch_decode(tokens)
    pred = transcript[0]

    # Word error rate against the hand-written reference transcript.
    WERS[model_id] = jiwer.wer(REF, pred)

    # Dump the prediction one sentence per line for manual inspection.
    with open(f"transcript_{model_id}.txt", "w") as f:
        f.writelines(sentence + "\n" for sentence in pred.split("."))

print(WERS)