File size: 2,970 Bytes
8c92a11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import torch

from torchmetrics import WordErrorRate


def _clean_transcript(text):
    """Normalize a transcript for WER: drop spaces and common punctuation, lowercase.

    Spaces are removed because whisper's Chinese output is not space-delimited;
    the same normalization is applied to both sides so they stay comparable.
    """
    return text.translate(str.maketrans("", "", " .'-,!")).lower()


def extract_wer(
    model,
    **kwargs,
):
    """Compute Word Error Rate (WER) between the predicted and the ground truth audio.

    Args:
        model: a whisper-style ASR model exposing ``transcribe(audio, ...)``
            that returns a dict with a ``"text"`` key.
        **kwargs: must contain a single ``kwargs`` dict with:
            intelligibility_mode: "gt_content" computes the WER between the
                predicted content obtained from the whisper model and the
                ground truth content (needs ``content_gt`` and ``audio_deg``);
                "gt_audio" computes the WER between the extracted ground truth
                and predicted contents obtained from the whisper model (needs
                ``audio_ref`` and ``audio_deg``).
            language: "chinese" forces ``language="zh"`` and a Mandarin
                initial prompt during transcription.
            content_gt: the ground truth content (for "gt_content" mode).
            audio_ref: path to the ground truth audio (for "gt_audio" mode).
            audio_deg: path to the predicted audio (both modes).

    Returns:
        The WER as a plain Python float.

    Raises:
        ValueError: if ``intelligibility_mode`` is not one of the two modes.
        KeyError: if a required key is missing from ``kwargs["kwargs"]``.
    """
    kwargs = kwargs["kwargs"]
    mode = kwargs["intelligibility_mode"]
    language = kwargs["language"]
    audio_deg = kwargs["audio_deg"]

    wer = WordErrorRate()
    if torch.cuda.is_available():
        wer = wer.to(torch.device("cuda"))

    # The Mandarin prompt nudges whisper toward the expected transcription
    # style; shared by every transcribe call in Chinese mode.
    if language == "chinese":
        transcribe_opts = {
            "language": "zh",
            "initial_prompt": "ไปฅไธ‹ๆ˜ฏๆ™ฎ้€š่ฏ็š„ๅฅๅญ",
        }
    else:
        transcribe_opts = {}

    # Get ground truth content
    if mode == "gt_content":
        content_gt = kwargs["content_gt"]
    elif mode == "gt_audio":
        # Bug fix: the reference transcript must come from audio_ref. The
        # original transcribed audio_deg twice in the non-Chinese branch,
        # which made the reference identical to the prediction (WER always 0).
        result_ref = model.transcribe(
            kwargs["audio_ref"], verbose=True, **transcribe_opts
        )
        content_gt = result_ref["text"]
    else:
        # Fail loudly instead of the original NameError fall-through.
        raise ValueError(f"Unsupported intelligibility_mode: {mode}")

    # Get predicted content
    result_deg = model.transcribe(audio_deg, verbose=True, **transcribe_opts)

    content_gt = _clean_transcript(content_gt)
    content_pred = _clean_transcript(result_deg["text"])

    return wer(content_pred, content_gt).detach().cpu().numpy().tolist()