Spaces:

Oopstom
/

ReactSeq

Sleeping

File size: 7,688 Bytes

c668e80

from onmt.utils.logging import logger
from onmt.transforms import register_transform
from .transform import Transform

from rapidfuzz import fuzz, process
import numpy as np
import time


class FuzzyMatcher(object):
    """Fuzzy matcher backed by an in-memory translation memory (TM).

    The TM is loaded once at construction time. `_get_batch_matches` then
    appends the target side of the best (non-exact) TM match to a share of
    the examples it is given.

    Args:
        tm_path: Path to the flat text TM file.
        corpus_ratio: Fraction of a batch that may be augmented.
        threshold: Minimum `fuzz.ratio` score for a TM entry to count as
            a match.
        tm_delimiter: Separator between source and target on a TM line.
        fuzzy_token: Token inserted between an example and its match.
        tm_unit_min_lentgh: Minimum source length (in characters) of a TM
            entry. The misspelled name is kept for backward compatibility
            with existing callers.
        tm_unit_max_length: Maximum source length (in characters) of a TM
            entry.
    """

    def __init__(
        self,
        tm_path,
        corpus_ratio,
        threshold=70,
        tm_delimiter="\t",
        fuzzy_token="｟fuzzy｠",
        tm_unit_min_lentgh=4,
        tm_unit_max_length=70,
    ):
        self.threshold = threshold
        self.corpus_ratio = corpus_ratio
        self.tm_delimiter = tm_delimiter
        self.fuzzy_token = fuzzy_token
        self.tm_unit_min_length = tm_unit_min_lentgh
        self.tm_unit_max_length = tm_unit_max_length
        self.internal_tm = self._create_tm(tm_path)

    def _create_tm(self, tm_path):
        """Load the translation memory from `tm_path`.

        The TM should be a utf-8 text file with each line
        containing a source sentence and its translation, separated
        by the `self.tm_delimiter`. A TM size of 200k-250k pairs should
        provide enough matches and good performance, but this may
        depend on overall system specs (RAM, CPU)

        Lines that do not split into exactly one source and one target
        (blank lines, missing or repeated delimiter) are skipped and
        reported in a single warning instead of aborting the load.

        Returns:
            A two-element list `[src_segments, tgt_segments]` of parallel
            lists of stripped strings.
        """

        src_segments, tgt_segments = [], []
        malformed = 0
        with open(tm_path, mode="r", encoding="utf-8") as file:
            # Iterate the file lazily; there is no need to hold every raw
            # line in memory next to the filtered segments.
            for pair in file:
                fields = pair.split(self.tm_delimiter)
                if len(fields) != 2:
                    malformed += 1
                    continue
                source, target = fields

                # Filter out very short or very long sentences
                # from the TM for better performance
                if (
                    len(source) < self.tm_unit_min_length
                    or len(source) > self.tm_unit_max_length
                ):
                    continue
                src_segments.append(source.strip())
                tgt_segments.append(target.strip())
        if malformed:
            logger.warning(
                f"FuzzyMatch transform: skipped {malformed} malformed "
                f"line(s) in the Translation Memory {tm_path}"
            )
        logger.debug(
            f"Translation Memory size for FuzzyMatch transform: " f"{len(src_segments)}"
        )
        return [src_segments, tgt_segments]

    def _get_batch_matches(self, batch):
        """Augment examples of `batch` with fuzzy matches from the TM.

        An example is augmented when its best TM score is at least
        `self.threshold` and below 100 (exact matches are not wanted) and
        it does not already contain `self.fuzzy_token`. Augmentation stops
        once `len(batch) * self.corpus_ratio` examples have been changed.

        Args:
            batch: Sequence of source sentences (strings).

        Returns:
            A new list with the same length and order as `batch`; augmented
            entries have the form `example + fuzzy_token + tm_target`.
        """
        tm_sources, tm_targets = self.internal_tm
        # Nothing can be matched without examples or without TM entries;
        # returning early also avoids an argmax over a zero-width matrix.
        if len(batch) == 0 or len(tm_sources) == 0:
            return list(batch)

        logger.debug(f"Starting fuzzy matching on {len(batch)} examples")
        fuzzy_count = 0
        start = time.time()
        augmented = list()
        max_fuzzies = len(batch) * self.corpus_ratio

        # We split the `batch` and perform fuzzy matching
        # in smaller chunks of 10.000 examples in order to
        # reduce memory usage.
        # Performance is not affected.
        # Plain slicing keeps the examples as Python strings and caps every
        # chunk at `chunk_size` examples.
        chunk_size = 10000
        for offset in range(0, len(batch), chunk_size):
            plist = list(batch[offset : offset + chunk_size])
            if fuzzy_count >= max_fuzzies:
                augmented.extend(plist)
                continue

            # Scores below the cutoff are reported as 0 by `cdist`.
            results = process.cdist(
                plist,
                tm_sources,
                scorer=fuzz.ratio,
                dtype=np.uint8,
                score_cutoff=self.threshold,
                workers=-1,
            )

            best_idx = results.argmax(axis=1)
            best_score = results.max(axis=1)
            for idx, s in enumerate(plist):
                # Probably redundant but let's be safe
                # in case some examples are already fuzzied
                # (e.g. from another pipeline or workflow)
                if self.fuzzy_token in s:
                    continue
                # A score of 0 means no match above the threshold and
                # a score of 100 is an exact match, which we don't want.
                if 0 < best_score[idx] < 100:
                    if fuzzy_count >= max_fuzzies:
                        break
                    plist[idx] = s + self.fuzzy_token + tm_targets[best_idx[idx]]
                    fuzzy_count += 1
            augmented.extend(plist)

        end = time.time()
        logger.debug(
            f"FuzzyMatch Transform: Added {fuzzy_count} " f"fuzzies in {end-start} secs"
        )

        return augmented


@register_transform(name="fuzzymatch")
class FuzzyMatchTransform(Transform):
    """Augment source examples with fuzzy matches from a translation memory.

    Each source is matched against the TM and, when a match is found, the
    target side of that match is appended to the source for
    Neural Fuzzy Repair.
    :cite:`bulte-tezcan-2019-neural`
    """

    def __init__(self, opts):
        super().__init__(opts)

    @classmethod
    def add_options(cls, parser):
        """Register the command line options of the transform."""

        group = parser.add_argument_group("Transform/FuzzyMatching")
        # The TM path is the only option without a default value.
        group.add("--tm_path", "-tm_path", type=str, help="Path to a flat text TM.")

        # (option name, type, default, help) for every defaulted option,
        # registered in this order.
        defaulted_options = (
            (
                "fuzzy_corpus_ratio",
                float,
                0.1,
                "Ratio of corpus to augment with fuzzy matches.",
            ),
            ("fuzzy_threshold", int, 70, "The fuzzy matching threshold."),
            ("tm_delimiter", str, "\t", "The delimiter used in the flat text TM."),
            (
                "fuzzy_token",
                str,
                "｟fuzzy｠",
                "The fuzzy token to be added with the matches.",
            ),
            (
                "fuzzymatch_min_length",
                int,
                4,
                "Min length for TM entries and examples to match.",
            ),
            (
                "fuzzymatch_max_length",
                int,
                70,
                "Max length for TM entries and examples to match.",
            ),
        )
        for option, option_type, default_value, help_text in defaulted_options:
            group.add(
                "--" + option,
                "-" + option,
                type=option_type,
                default=default_value,
                help=help_text,
            )

    def _parse_opts(self):
        """Copy the fuzzy matching options from `self.opts` onto the instance."""
        options = self.opts
        self.tm_path = options.tm_path
        self.fuzzy_corpus_ratio = options.fuzzy_corpus_ratio
        self.fuzzy_threshold = options.fuzzy_threshold
        self.tm_delimiter = options.tm_delimiter
        self.fuzzy_token = options.fuzzy_token
        self.fuzzymatch_min_length = options.fuzzymatch_min_length
        self.fuzzymatch_max_length = options.fuzzymatch_max_length

    @classmethod
    def get_specials(cls, opts):
        """Return the fuzzy token as a src special and no tgt specials."""

        return [opts.fuzzy_token], []

    def warm_up(self, vocabs=None):
        """Build the `FuzzyMatcher` from the parsed options.

        `vocabs` is accepted but not used; `None` is forwarded to the
        parent `warm_up`.
        """

        super().warm_up(None)
        self.matcher = FuzzyMatcher(
            self.tm_path,
            self.fuzzy_corpus_ratio,
            self.fuzzy_threshold,
            self.tm_delimiter,
            self.fuzzy_token,
            self.fuzzymatch_min_length,
            self.fuzzymatch_max_length,
        )

    def apply(self, example, is_train=False, stats=None, **kwargs):
        """Return `example` unchanged; the work is done in `batch_apply`."""
        return example

    def batch_apply(self, batch, is_train=False, stats=None, **kwargs):
        """Append fuzzy matches to the `src` of the examples in `batch`.

        Sources whose joined length is not strictly between the configured
        min and max lengths are sent to the matcher as an empty string and
        are left untouched afterwards. `batch` is modified in place and
        returned.
        """
        shortest = self.fuzzymatch_min_length
        longest = self.fuzzymatch_max_length

        candidates = []
        for item, _, _ in batch:
            sentence = " ".join(item["src"])
            # Basic length filter: skipping very short or very long
            # sentences speeds up the fuzzy matching a bit.
            if shortest < len(sentence) < longest:
                candidates.append(sentence)
            else:
                candidates.append("")

        matched = self.matcher._get_batch_matches(candidates)
        assert len(candidates) == len(matched)

        for (item, _, _), new_source in zip(batch, matched):
            if new_source != "":
                item["src"] = new_source.split()

        return batch