File size: 7,688 Bytes
c668e80
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
from onmt.utils.logging import logger
from onmt.transforms import register_transform
from .transform import Transform

from rapidfuzz import fuzz, process
import numpy as np
import time


class FuzzyMatcher(object):
    """Builds an in-memory translation memory (TM) and finds fuzzy matches.

    Source examples are augmented in place with the target side of their
    best TM match, separated by ``fuzzy_token`` (Neural Fuzzy Repair).
    """

    def __init__(
        self,
        tm_path,
        corpus_ratio,
        threshold=70,
        tm_delimiter="\t",
        fuzzy_token="⦅fuzzy⦆",
        tm_unit_min_lentgh=4,
        tm_unit_max_length=70,
    ):
        # NOTE(review): `tm_unit_min_lentgh` is a typo, kept as-is for
        # backward compatibility with callers that pass it by keyword.
        self.threshold = threshold
        self.corpus_ratio = corpus_ratio
        self.tm_delimiter = tm_delimiter
        self.fuzzy_token = fuzzy_token
        self.tm_unit_min_length = tm_unit_min_lentgh
        self.tm_unit_max_length = tm_unit_max_length
        self.internal_tm = self._create_tm(tm_path)

    def _create_tm(self, tm_path):
        """Load the TM into two parallel lists ``[sources, targets]``.

        The TM should be a utf-8 text file with each line
        containing a source sentence and its translation, separated
        by the `self.tm_delimiter`. A TM size of 200k-250k pairs should
        provide enough matches and good performance, but this may
        depend on overall system specs (RAM, CPU)
        """

        src_segments, tgt_segments = list(), list()
        # Iterate the file lazily instead of readlines() to avoid
        # holding a second full copy of the TM in memory.
        with open(tm_path, mode="r", encoding="utf-8") as file:
            for pair in file:
                # Split only on the first delimiter so a delimiter
                # occurring inside the target does not break the pair;
                # skip malformed lines with no delimiter at all.
                parts = pair.split(self.tm_delimiter, 1)
                if len(parts) != 2:
                    continue
                source, target = parts

                # Filter out very short or very long sentences
                # from the TM for better performance
                if (
                    len(source) < self.tm_unit_min_length
                    or len(source) > self.tm_unit_max_length
                ):
                    continue
                src_segments.append(source.strip())
                tgt_segments.append(target.strip())
        logger.debug(
            f"Translation Memory size for FuzzyMatch transform: " f"{len(src_segments)}"
        )
        return [src_segments, tgt_segments]

    def _get_batch_matches(self, batch):
        """Return `batch` with up to ``corpus_ratio`` of its examples
        augmented as ``src + fuzzy_token + matched_target``.

        Empty strings (pre-filtered examples) never match and pass
        through unchanged.
        """

        logger.debug(f"Starting fuzzy matching on {len(batch)} examples")
        fuzzy_count = 0
        # Loop-invariant cap on how many examples may be augmented.
        max_fuzzies = len(batch) * self.corpus_ratio
        start = time.time()
        augmented = list()

        # We split the `batch` and perform fuzzy matching
        # in smaller chunks of 10.000 examples in order to
        # reduce memory usage.
        # Perfomance is not affected.
        chunk_size = 10000
        mini_batches = np.array_split(
            batch, len(batch) // chunk_size if len(batch) > chunk_size else 1
        )
        for mini_batch in mini_batches:
            plist = list(mini_batch)
            if fuzzy_count >= max_fuzzies:
                augmented.extend(plist)
                continue

            # Pairwise similarity matrix (examples x TM sources);
            # scores below the cutoff are zeroed by rapidfuzz.
            results = process.cdist(
                plist,
                self.internal_tm[0],
                scorer=fuzz.ratio,
                dtype=np.uint8,
                score_cutoff=self.threshold,
                workers=-1,
            )

            matches = np.any(results, 1)
            argmax = np.argmax(results, axis=1)
            for idx, s in enumerate(plist):
                # Probably redundant but let's be safe
                # in case some examples are already fuzzied
                # (e.g. from another pipeline or workflow)
                if self.fuzzy_token in s:
                    continue
                # We don't want exact matches
                if matches[idx] and results[idx][argmax[idx]] < 100:
                    if fuzzy_count >= max_fuzzies:
                        break
                    plist[idx] = s + self.fuzzy_token + self.internal_tm[1][argmax[idx]]
                    fuzzy_count += 1
            augmented.extend(plist)

        end = time.time()
        logger.debug(
            f"FuzzyMatch Transform: Added {fuzzy_count} " f"fuzzies in {end-start} secs"
        )

        return augmented


@register_transform(name="fuzzymatch")
class FuzzyMatchTransform(Transform):
    """Perform fuzzy matching against a translation memory and
    augment source examples with target matches for Neural Fuzzy Repair.
    :cite:`bulte-tezcan-2019-neural`
    """

    def __init__(self, opts):
        super().__init__(opts)

    @classmethod
    def add_options(cls, parser):
        """Options for fuzzy matching."""

        group = parser.add_argument_group("Transform/FuzzyMatching")
        group.add("--tm_path", "-tm_path", type=str, help="Path to a flat text TM.")
        group.add(
            "--fuzzy_corpus_ratio",
            "-fuzzy_corpus_ratio",
            type=float,
            default=0.1,
            help="Ratio of corpus to augment with fuzzy matches.",
        )
        group.add(
            "--fuzzy_threshold",
            "-fuzzy_threshold",
            type=int,
            default=70,
            help="The fuzzy matching threshold.",
        )
        group.add(
            "--tm_delimiter",
            "-tm_delimiter",
            type=str,
            default="\t",
            help="The delimiter used in the flat text TM.",
        )
        group.add(
            "--fuzzy_token",
            "-fuzzy_token",
            type=str,
            default="⦅fuzzy⦆",
            help="The fuzzy token to be added with the matches.",
        )
        group.add(
            "--fuzzymatch_min_length",
            "-fuzzymatch_min_length",
            type=int,
            default=4,
            help="Min length for TM entries and examples to match.",
        )
        group.add(
            "--fuzzymatch_max_length",
            "-fuzzymatch_max_length",
            type=int,
            default=70,
            help="Max length for TM entries and examples to match.",
        )

    def _parse_opts(self):
        """Copy the parsed options onto the transform instance."""

        self.tm_path = self.opts.tm_path
        self.fuzzy_corpus_ratio = self.opts.fuzzy_corpus_ratio
        self.fuzzy_threshold = self.opts.fuzzy_threshold
        self.tm_delimiter = self.opts.tm_delimiter
        self.fuzzy_token = self.opts.fuzzy_token
        self.fuzzymatch_min_length = self.opts.fuzzymatch_min_length
        self.fuzzymatch_max_length = self.opts.fuzzymatch_max_length

    @classmethod
    def get_specials(cls, opts):
        """Add the fuzzy match token to the src vocab."""

        return ([opts.fuzzy_token], list())

    def warm_up(self, vocabs=None):
        """Create the fuzzy matcher."""

        super().warm_up(None)
        self.matcher = FuzzyMatcher(
            self.tm_path,
            self.fuzzy_corpus_ratio,
            self.fuzzy_threshold,
            self.tm_delimiter,
            self.fuzzy_token,
            self.fuzzymatch_min_length,
            self.fuzzymatch_max_length,
        )

    def apply(self, example, is_train=False, stats=None, **kwargs):
        """No-op: augmentation happens batch-wise in `batch_apply`."""

        return example

    def batch_apply(self, batch, is_train=False, stats=None, **kwargs):
        """Fuzzy-match a whole batch and splice matches into `src`.

        Examples filtered out by length are sent to the matcher as empty
        strings, which never match, so the output stays index-aligned
        with `batch`.
        """

        src_segments = list()
        for ex, _, _ in batch:
            # Join the tokenized source once per example; the original
            # recomputed it for every length check.
            src = " ".join(ex["src"])
            # Apply a basic filtering to leave out very short or very long
            # sentences and speed up things a bit during fuzzy matching
            if self.fuzzymatch_min_length < len(src) < self.fuzzymatch_max_length:
                src_segments.append(src)
            else:
                src_segments.append("")
        fuzzied_src = self.matcher._get_batch_matches(src_segments)
        assert len(src_segments) == len(fuzzied_src)
        for idx, (example, _, _) in enumerate(batch):
            if fuzzied_src[idx] != "":
                example["src"] = fuzzied_src[idx].split()

        return batch