File size: 4,204 Bytes
814113b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
"""Levenshtein metric file."""

from __future__ import annotations

from typing import TYPE_CHECKING

import datasets
import evaluate

from Levenshtein import distance

if TYPE_CHECKING:
    from collections.abc import Sequence

_CITATION = """\
@InProceedings{huggingface:levenshtein,
    title = {Levenshtein (edit) distance},
    authors={Nathan Fradet},
    year={2024}
}
"""

_DESCRIPTION = """\
This metrics computes the Levenshtein (edit) distance.
It directly calls the "Levenshtein" package using the ``distance`` method:
https://rapidfuzz.github.io/Levenshtein/levenshtein.html#Levenshtein.distance
"""


_KWARGS_DESCRIPTION = """
This metric computes the Levenshtein distance, also commonly called "edit distance".
The Levenshtein distance measures the number of combined editions, deletions and
additions to perform on a string so that it becomes identical to a second one. It is a
popular metric for text similarity.
This module directly calls the
[Levenshtein package](https://github.com/rapidfuzz/Levenshtein) for fast execution
speed.

Args:
    predictions: list of prediction strings.
    references: list of reference strings.
    **kwargs: keyword arguments to pass to the [Levenshtein.distance](https://rapidfuzz.github.io/Levenshtein/levenshtein.html#Levenshtein.distance)
        method.
Returns:
    Dictionary mapping to the average Levenshtein distance (lower is better) and the
        ratio ([0, 1]) distance (higher is better).
Examples:
    >>> levenshtein = evaluate.load("Natooz/Levenshtein")
    >>> results = levenshtein.compute(
    ...     predictions=[
    ...         "foo", "baroo"
    ...     ],
    ...     references=,[
    ...         "foo", "bar1"
    ...     ],
    ... )
    >>> print(results)
    {"levenshtein": 1, "levenshtein_ratio": 0.875}
"""


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class Levenshtein(evaluate.Metric):
    """Module for the ``distance`` method of the "Levenshtein" package."""

    def _info(self) -> evaluate.MetricInfo:
        """
        Return the module info.

        :return: module info.
        """
        return evaluate.MetricInfo(
            # This is the description that will appear on the modules page.
            module_type="metric",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            # This defines the format of each prediction and reference
            features=datasets.Features(
                {
                    "predictions": datasets.Value("string"),
                    "references": datasets.Value("string"),
                }
            ),
            # Homepage of the module for documentation
            homepage="https://huggingface.co/spaces/Natooz/Levenshtein",
            # Additional links to the codebase or references
            codebase_urls=[
                "https://github.com/rapidfuzz/Levenshtein",
            ],
            reference_urls=[
                "https://rapidfuzz.github.io/Levenshtein/levenshtein.html#Levenshtein.distance"
            ],
        )

    def _compute(
        self,
        predictions: Sequence[str] | None = None,
        references: Sequence[str] | None = None,
        **kwargs,
    ) -> dict[str, float]:
        """
        Return the average Levenshtein (edit) distance and distance ratio.

        See the "Levenshtein" PyPi package documentation for the complete usage
        information: https://rapidfuzz.github.io/Levenshtein/

        :param predictions: sequence of prediction strings.
        :param references: sequence of reference strings, aligned with the
            predictions.
        :param kwargs: keyword arguments passed to ``Levenshtein.distance``
            (e.g. ``weights``, ``score_cutoff``).
        :return: dictionary with ``"levenshtein"``, the average edit distance
            (lower is better), and ``"levenshtein_ratio"``, the average
            normalized similarity in [0, 1] (higher is better).
        :raises ValueError: if the inputs are empty or their lengths differ.
        """
        # Guard against empty input, which would otherwise raise an obscure
        # ZeroDivisionError when averaging the results below.
        if not predictions or not references:
            msg = "`predictions` and `references` must be non-empty sequences."
            raise ValueError(msg)
        if len(predictions) != len(references):
            msg = "The number of predictions must be equal to the number of references."
            raise ValueError(msg)

        # Compute the distances
        results, ratios = [], []
        for prediction, reference in zip(predictions, references):
            edit_distance = distance(prediction, reference, **kwargs)
            results.append(edit_distance)
            # Normalize by the combined length of the pair. Two empty strings
            # are identical, so their distance ratio is defined as 0.
            total_len = len(prediction) + len(reference)
            ratios.append(edit_distance / total_len if total_len else 0.0)

        # Return average distance and ratio
        return {
            "levenshtein": sum(results) / len(results),
            "levenshtein_ratio": 1 - sum(ratios) / len(ratios),
        }