|
""" Character Error Ratio (CER) metric. """ |
|
from typing import List |
|
import datasets, evaluate , jiwer |
|
import jiwer.transforms as tr |
|
from datasets.config import PY_VERSION |
|
from packaging import version |
|
|
|
|
|
if PY_VERSION < version.parse("3.8"): |
|
import importlib_metadata |
|
else: |
|
import importlib.metadata as importlib_metadata |
|
|
|
# Separator handed to the sentence-merging transforms below. In the fallback
# transform an empty (or None) delimiter means nothing is inserted between sentences.
SENTENCE_DELIMITER = ""


if version.parse(importlib_metadata.version("jiwer")) >= version.parse("2.3.0"):
    # jiwer 2.3.0 and later: compose the character pipeline from jiwer's own transforms.
    cer_transform = tr.Compose(
        [
            tr.RemoveMultipleSpaces(),
            tr.Strip(),
            tr.ReduceToSingleSentence(SENTENCE_DELIMITER),
            tr.ReduceToListOfListOfChars(),
        ]
    )
else:
    # Older jiwer: supply a local transform that flattens sentences into characters.

    class SentencesToListOfCharacters(tr.AbstractTransform):
        """Flatten one sentence, or a list of sentences, into a list of characters."""

        def __init__(self, sentence_delimiter: str = " "):
            """Remember the delimiter to place between consecutive sentences."""
            self.sentence_delimiter = sentence_delimiter

        def process_string(self, s: str):
            """Return the characters of `s` as a list."""
            return list(s)

        def process_list(self, inp: List[str]):
            """Concatenate the characters of every sentence in `inp`.

            A non-empty delimiter is appended after each sentence except the last one.
            """
            delimiter = self.sentence_delimiter
            insert_delimiter = delimiter is not None and delimiter != ""
            last_position = len(inp) - 1

            flattened = []
            for position, text in enumerate(inp):
                flattened += self.process_string(text)
                if insert_delimiter and position < last_position:
                    flattened.append(delimiter)
            return flattened

    cer_transform = tr.Compose(
        [
            tr.RemoveMultipleSpaces(),
            tr.Strip(),
            SentencesToListOfCharacters(SENTENCE_DELIMITER),
        ]
    )
|
|
|
|
|
# BibTeX entry passed as `citation=` to `evaluate.MetricInfo` in `CER._info`.
_CITATION = """\
@inproceedings{inproceedings,
author = {Morris, Andrew and Maier, Viktoria and Green, Phil},
year = {2004},
month = {01},
pages = {},
title = {From WER and RIL to MER and WIL: improved evaluation measures for connected speech recognition.}
}
"""
|
|
|
|
|
# Prose description of the metric. It is passed as `description=` to
# `evaluate.MetricInfo` in `CER._info` and handed to the `add_start_docstrings`
# decorator applied to the `CER` class.
_DESCRIPTION = """\
Character error rate (CER) is a standard metric of the performance of an automatic speech recognition system.

CER is similar to Word Error Rate (WER) but operates on characters instead of words. Please refer to the docs of WER for further information.

The character error rate can be computed as:

CER = (S + D + I) / N = (S + D + I) / (S + D + C)

where

S is the number of substitutions,
D is the number of deletions,
I is the number of insertions,
C is the number of correct characters,
N is the number of characters in the reference (N=S+D+C).

CER's output is not always a number between 0 and 1, particularly when there is a high number of insertions. This value is often associated to the percentage of characters that were incorrectly predicted. The lower the value, the better the
performance of the ASR system with a CER of 0 being a perfect score.
"""
|
|
|
# Argument/return documentation plus a usage example. It is passed as
# `inputs_description=` to `evaluate.MetricInfo` in `CER._info` and handed to the
# `add_start_docstrings` decorator applied to the `CER` class.
#
# The example is kept in valid doctest form: only Python statements follow a
# `>>>` prompt, and the expected output is written on its own line without one.
_KWARGS_DESCRIPTION = """
Computes CER score of transcribed segments against references.
Args:
    references: list of references for each speech input.
    predictions: list of transcriptions to score.
    concatenate_texts: Whether or not to concatenate sentences before evaluation, set to True for a more accurate result.
Returns:
    (float): the character error rate

Examples for the Hungarian Language:

    Install the dependencies first (for instance in a Colab cell): pip install evaluate jiwer

    >>> import evaluate

    >>> predictions = ["ez a jóslat", "van egy másik minta is"]
    >>> references = ["ez a hivatkozás", "van még egy"]
    >>> cer = evaluate.load("cer")
    >>> cer_score = cer.compute(predictions=predictions, references=references)
    >>> print(cer_score)
    0.9615384615384616
"""
|
|
|
|
|
|
|
# Character error rate metric built on jiwer's edit-distance measures; the class
# docstring is assembled by the decorator from the two description constants.
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class CER(evaluate.Metric):
    def _info(self):
        """Return the `evaluate.MetricInfo` for this metric.

        Both `predictions` and `references` are declared as string features.
        """
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features(
                {
                    "predictions": datasets.Value("string", id="sequence"),
                    "references": datasets.Value("string", id="sequence"),
                }
            ),
            codebase_urls=["https://github.com/jitsi/jiwer/"],
            reference_urls=[
                "https://en.wikipedia.org/wiki/Word_error_rate",
                "https://sites.google.com/site/textdigitisation/qualitymeasures/computingerrorrates",
            ],
        )

    @staticmethod
    def _jiwer_measures(reference, prediction):
        """Run jiwer with `cer_transform` on both sides and return its measures.

        The result is always indexable by "wer", "substitutions", "deletions",
        "insertions" and "hits", whichever jiwer API produced it.
        """
        # Keep the original call whenever the installed jiwer still provides it.
        if hasattr(jiwer, "compute_measures"):
            return jiwer.compute_measures(
                reference,
                prediction,
                truth_transform=cer_transform,
                hypothesis_transform=cer_transform,
            )

        # jiwer releases without `compute_measures` expose `process_words`, which
        # names the first transform `reference_transform` and returns an object
        # with attributes instead of a dict.
        output = jiwer.process_words(
            reference,
            prediction,
            reference_transform=cer_transform,
            hypothesis_transform=cer_transform,
        )
        return {
            "wer": output.wer,
            "substitutions": output.substitutions,
            "deletions": output.deletions,
            "insertions": output.insertions,
            "hits": output.hits,
        }

    def _compute(self, predictions, references, concatenate_texts=False):
        """Compute the character error rate of `predictions` against `references`.

        Args:
            predictions: transcriptions to score.
            references: reference text for each transcription.
            concatenate_texts: when true, all references and predictions are handed
                to jiwer in a single call and its "wer" value is returned (the
                transform splits text into characters, so this is a character-level
                rate). Otherwise edit counts are accumulated pair by pair.

        Returns:
            The error rate: (S + D + I) / (S + D + C) in the pair-by-pair mode.

        Raises:
            ZeroDivisionError: in the pair-by-pair mode when no reference characters
                were counted (for example when the inputs are empty).
        """
        if concatenate_texts:
            return self._jiwer_measures(references, predictions)["wer"]

        incorrect = 0
        total = 0
        for prediction, reference in zip(predictions, references):
            measures = self._jiwer_measures(reference, prediction)
            incorrect += measures["substitutions"] + measures["deletions"] + measures["insertions"]
            # Reference length N = S + D + C (hits are the correct characters).
            total += measures["substitutions"] + measures["deletions"] + measures["hits"]

        return incorrect / total