# Spaces: Oopstom / ReactSeq (Sleeping)
# File size: 11,710 Bytes
# c668e80

import torch
import pyonmttok
from onmt.constants import DefaultTokens, CorpusTask, ModelTask
from torch.nn.utils.rnn import pad_sequence
from onmt.utils.logging import logger
from collections import Counter


def parse_features(line, n_feats=0, defaults=None):
    """
    Parse a text line whose tokens carry features appended with "￨".

    Ex.: This￨A￨B is￨A￨A a￨C￨A test￨A￨B

    Args:
        line (str): tokens separated by single spaces; each token may be
            followed by "￨"-separated feature values.
        n_feats (int): number of features expected on every token.
        defaults (str or list, optional): feature values given to a token
            that carries none (only used when ``n_feats > 0``). A string
            is split on "￨".

    Returns:
        tuple: ``(text, feats)`` where ``text`` is the space-joined tokens
        without their features and ``feats`` is a list of ``n_feats``
        space-joined feature strings (one per feature), or ``None`` when
        ``n_feats`` is 0.

    Raises:
        AssertionError: if a token does not end up with exactly
            ``n_feats`` features, if ``defaults`` has the wrong length, or
            if only part of the tokens carry features in the data.
    """
    # Normalise the defaults once instead of re-checking them per token.
    if isinstance(defaults, str):
        defaults = defaults.split("￨")

    text, feats = [], [[] for _ in range(n_feats)]
    # `check` counts features actually present in the data (defaults are
    # deliberately not counted); `count` counts tokens.
    check, count = 0, 0
    for token in line.split(" "):
        tok, *fts = token.strip().split("￨")
        check += len(fts)
        count += 1
        if not fts and defaults is not None and n_feats > 0:
            assert len(defaults) == n_feats  # Security check
            fts = defaults
        assert len(fts) == n_feats, (
            f"The number of features does not match the "
            f"expected number of features. Found {len(fts)} "
            f"features in the data but {n_feats} were expected."
        )
        text.append(tok)
        for i, feat_stream in enumerate(feats):
            feat_stream.append(fts[i])
    # Check if all tokens have features or none at all
    assert (
        check == 0 or check == count * n_feats
    ), "Some tokens are missing features. Please check your data."
    feats = [" ".join(x) for x in feats] if n_feats > 0 else None
    return " ".join(text), feats


def append_features_to_text(text, features):
    """
    Re-attach feature values to their tokens, e.g. when dumping to file.

    ``text`` is a space-separated token string and ``features`` is a list
    of space-separated feature strings (one string per feature). Each
    output token is ``tok￨f1￨f2...``; a token whose joined features are
    empty is emitted unchanged. Tokens and features are paired with
    ``zip``, so the result stops at the shortest sequence.
    """
    words = text.split(" ")
    feature_columns = [feat_line.split(" ") for feat_line in features]

    def _decorate(row):
        # row is (token, feat_1, ..., feat_k) for a single position.
        word, *word_feats = row
        suffix = "￨".join(word_feats)
        return f"{word}￨{suffix}" if suffix else word

    return " ".join(_decorate(row) for row in zip(words, *feature_columns))


def text_sort_key(ex):
    """Sort key based on sequence lengths.

    Returns ``(len(src_ids), len(tgt_ids))`` when the example has a truthy
    ``"tgt"`` entry, otherwise just ``len(src_ids)``.
    """
    target = ex["tgt"]
    src_len = len(ex["src"]["src_ids"])
    if not target:
        return src_len
    return src_len, len(target["tgt_ids"])


def clean_example(maybe_example):
    """Join the token lists of an example into strings, in place.

    After the call the example looks like
    ``{'src': {'src': ..., 'feats': [..., ...]}, 'tgt': {'tgt': ...}, ...}``:
    ``"src_feats"`` (if present) is moved under ``["src"]["feats"]``,
    ``"tgt"`` is wrapped only when it is not ``None`` and ``"align"``
    (if present) is space-joined. The same dict object is returned.
    """
    src_entry = {"src": " ".join(maybe_example["src"])}
    maybe_example["src"] = src_entry

    if "src_feats" in maybe_example:
        # Features become part of the source entry.
        src_entry["feats"] = [
            " ".join(feat_tokens) for feat_tokens in maybe_example["src_feats"]
        ]
        maybe_example.pop("src_feats")

    target_tokens = maybe_example["tgt"]
    if target_tokens is not None:
        maybe_example["tgt"] = {"tgt": " ".join(target_tokens)}

    if "align" in maybe_example:
        maybe_example["align"] = " ".join(maybe_example["align"])

    return maybe_example


def process(task, bucket, **kwargs):
    """Apply transforms to a bucket and return the surviving examples.

    Each bucket item is indexed as ``(example, transform, cid)``. Items are
    grouped by ``(transform, cid)`` so that ``transform.batch_apply`` runs
    once per group, then every transformed example is passed through
    ``clean_example``. Examples whose source text is empty are dropped.

    Returns the list of cleaned examples, or ``None`` if nothing is left.
    """
    groups = {}
    for item in bucket:
        groups.setdefault((item[1], item[2]), []).append(item)

    kept = []
    for (transform, cid), group in groups.items():
        transformed = transform.batch_apply(
            group, is_train=(task == CorpusTask.TRAIN), corpus_name=cid
        )
        for raw_example, _transform, _cid in transformed:
            cleaned = clean_example(raw_example)
            # at this point an example looks like:
            # {'src': {'src': ..., 'feats': [....]},
            #  'tgt': {'tgt': ...},
            #  'src_original': ['tok1', ...'tokn'],
            #  'tgt_original': ['tok1', ...'tokm'],
            #  'indices' : seq in bucket
            #  'align': ...,
            # }
            if len(cleaned["src"]["src"]) > 0:
                kept.append(cleaned)

    return kept or None


def numericalize(vocabs, example):
    """Convert the text fields of ``example`` into vocabulary ids, in place.

    * SEQ2SEQ: ``src_ids`` come from the source tokens; ``tgt_ids`` from
      ``[decoder_start_token] + target tokens + [EOS]``.
    * LANGUAGE_MODEL: ``src_ids`` come from the source tokens, preceded by
      ``decoder_start_token`` unless it is the empty string; ``tgt_ids``
      from ``target tokens + [EOS]``.

    Target ids are only produced when ``example["tgt"]`` is not ``None``.
    If the source has ``"feats"``, each feature string is replaced by its
    ids using the matching entry of ``vocabs["src_feats"]``.

    Returns the same (mutated) example. Raises ``ValueError`` for any other
    value of ``vocabs["data_task"]``.
    """
    decoder_start_token = vocabs["decoder_start_token"]
    numeric = example  # same object: the example is updated in place
    numeric["src"]["src_ids"] = []

    # Work out which tokens are prepended on each side for this task.
    if vocabs["data_task"] == ModelTask.SEQ2SEQ:
        src_prefix, tgt_prefix = [], [decoder_start_token]
    elif vocabs["data_task"] == ModelTask.LANGUAGE_MODEL:
        src_prefix = [decoder_start_token] if decoder_start_token != "" else []
        tgt_prefix = []
    else:
        raise ValueError(f"Something went wrong with task {vocabs['data_task']}")

    src_tokens = src_prefix + example["src"]["src"].split()
    numeric["src"]["src_ids"] = vocabs["src"](src_tokens)

    if example["tgt"] is not None:
        numeric["tgt"]["tgt_ids"] = []
        tgt_tokens = example["tgt"]["tgt"].split()
        numeric["tgt"]["tgt_ids"] = vocabs["tgt"](
            tgt_prefix + tgt_tokens + [DefaultTokens.EOS]
        )

    if "feats" in example["src"]:
        numeric["src"]["feats"] = [
            feat_vocab(feat_line.split())
            for feat_vocab, feat_line in zip(
                vocabs["src_feats"], example["src"]["feats"]
            )
        ]

    return numeric


def parse_align_idx(align_pharaoh):
    """
    Turn a Pharaoh alignment string ("0-0 1-2 ...") into a list of
    ``[src_idx, tgt_idx]`` integer pairs.

    A pair that does not split into exactly two parts on "-" is logged and
    the ``ValueError`` is re-raised.
    """
    pairs = []
    for pair in align_pharaoh.strip().split(" "):
        try:
            src_part, tgt_part = pair.split("-")
        except ValueError:
            logger.warning("{} in `{}`".format(pair, align_pharaoh))
            logger.warning("Bad alignement line exists. Please check file!")
            raise
        else:
            pairs.append([int(src_part), int(tgt_part)])
    return pairs


def tensorify(vocabs, minibatch):
    """
    Turn a list of numericalized examples into a dict of batch tensors.

    Each example looks like
    {'src': {'src': ..., 'feats': [...], 'src_ids': ...},
     'tgt': {'tgt': ..., 'tgt_ids': ...},
     'src_original': ['tok1', ...'tokn'],
     'tgt_original': ['tok1', ...'tokm'],
     'indices' : seq in bucket
     'align': ...,
    }
    Which optional fields are processed is decided from the FIRST example.

    Returns a dict (sequences are padded batch-first):
        {'src': [batchsize, seqlen, n_feats+1],
         'tgt': [batchsize, seqlen, 1],
         'indices': [batchsize],
         'srclen': [batchsize],
         'tgtlen': [batchsize],
         'align': LongTensor whose rows are [batch_idx, tgt_idx + 1, src_idx],
         'src_map': [batchsize, src seqlen, max src_map id + 1] one-hot,
         'alignment': [batchsize, tgt seqlen] zero-padded ids,
         'src_ex_vocab': list with one entry per example,
        }
    """

    def _padded(id_lists, pad_id):
        # Right-pad variable-length id lists into one [batch, maxlen] tensor.
        return pad_sequence(
            [torch.LongTensor(ids) for ids in id_lists],
            batch_first=True,
            padding_value=pad_id,
        )

    tensor_batch = {}

    src_ids = _padded(
        [ex["src"]["src_ids"] for ex in minibatch],
        vocabs["src"][DefaultTokens.PAD],
    )
    first = minibatch[0]

    if "feats" in first["src"]:
        # Token ids and every feature stream are stacked on a third axis.
        columns = [src_ids]
        for feat_id in range(len(first["src"]["feats"])):
            columns.append(
                _padded(
                    [ex["src"]["feats"][feat_id] for ex in minibatch],
                    vocabs["src_feats"][feat_id][DefaultTokens.PAD],
                )
            )
        src = torch.stack(columns, dim=2)
    else:
        # No features: still expose a trailing feature axis of size one.
        src = src_ids.unsqueeze(2)

    tensor_batch["src"] = src
    tensor_batch["indices"] = torch.LongTensor([ex["indices"] for ex in minibatch])
    tensor_batch["srclen"] = torch.LongTensor(
        [len(ex["src"]["src_ids"]) for ex in minibatch]
    )
    batch_size = len(tensor_batch["srclen"])

    if first["tgt"] is not None:
        tgt = _padded(
            [ex["tgt"]["tgt_ids"] for ex in minibatch],
            vocabs["tgt"][DefaultTokens.PAD],
        ).unsqueeze(2)
        tensor_batch["tgt"] = tgt
        tensor_batch["tgtlen"] = torch.LongTensor(
            [len(ex["tgt"]["tgt_ids"]) for ex in minibatch]
        )

    if "align" in first and first["align"] is not None:
        # Target positions are shifted by one in the stored indices.
        tensor_batch["align"] = torch.LongTensor(
            [
                [row, tgt_pos + 1, src_pos]
                for row, ex in enumerate(minibatch)
                for src_pos, tgt_pos in parse_align_idx(ex["align"])
            ]
        )

    if "src_map" in first:
        copy_vocab_size = max(max(ex["src_map"]) for ex in minibatch) + 1
        src_map = torch.zeros(batch_size, src.size(1), copy_vocab_size)
        for row, ex in enumerate(minibatch):
            for col, token_id in enumerate(ex["src_map"]):
                src_map[row, col, token_id] = 1
        tensor_batch["src_map"] = src_map

    if "alignment" in first:
        # NOTE: relies on `tgt` having been built above.
        alignment = torch.zeros(batch_size, tgt.size(1), dtype=torch.long)
        for row, ex in enumerate(minibatch):
            ids = ex["alignment"]
            alignment[row, : len(ids)] = torch.LongTensor(ids)
        tensor_batch["alignment"] = alignment

    if "src_ex_vocab" in first:
        tensor_batch["src_ex_vocab"] = [ex["src_ex_vocab"] for ex in minibatch]

    return tensor_batch


def textbatch_to_tensor(vocabs, batch, is_train=False):
    """
    Hack: turn a plain batch of text examples into a single tensorized
    batch that can be passed through _translate().

    Every example is annotated in place with ``srclen`` (number of
    whitespace-separated source tokens), ``indices`` (its position in
    ``batch``) and ``align=None``, then numericalized. Examples are sorted
    by ``text_sort_key`` in descending order before being tensorized.
    ``is_train`` is accepted but not used here.

    Returns a one-element list holding the tensorized batch.
    """
    numericalized = []
    for position, ex in enumerate(batch):
        # Keep it consistent with dynamic data
        ex["srclen"] = len(ex["src"]["src"].split())
        ex["indices"] = position
        ex["align"] = None
        numericalized.append(numericalize(vocabs, ex))

    longest_first = sorted(numericalized, key=text_sort_key, reverse=True)
    return [tensorify(vocabs, longest_first)]


def _addcopykeys(vocabs, example):
    """Create copy-vocab and numericalize with it.

    In-place adds ``"src_map"`` and ``"src_ex_vocab"`` to ``example``.
    ``"src_ex_vocab"`` is a small vocabulary built from the tokens of
    ``example["src"]["src"]`` plus the UNK/PAD/BOS/EOS specials, and
    ``"src_map"`` is the source numericalized with it. If
    ``example["tgt"]`` is not ``None``, also adds ``"alignment"``: the
    target numericalized with the same copy-vocab, with an UNK token added
    at the end and, for SEQ2SEQ only, at the start as well.

    Args:
        vocabs: mapping providing ``"data_task"``.
        example (dict): An example dictionary with a ``"src"`` entry and a
            ``"tgt"`` entry that may be ``None``.
            (This argument changes in place!)

    Returns:
        ``example``, changed as described.

    Raises:
        ValueError: if a target is present and ``vocabs["data_task"]`` is
            neither SEQ2SEQ nor LANGUAGE_MODEL.
    """
    src = example["src"]["src"].split()
    # make a small vocab containing just the tokens in the source sequence
    src_ex_vocab = pyonmttok.build_vocab_from_tokens(
        Counter(src),
        maximum_size=0,
        minimum_frequency=1,
        special_tokens=[
            DefaultTokens.UNK,
            DefaultTokens.PAD,
            DefaultTokens.BOS,
            DefaultTokens.EOS,
        ],
    )
    # Tokens missing from this vocab are looked up as UNK.
    src_ex_vocab.default_id = src_ex_vocab[DefaultTokens.UNK]

    # Map source tokens to indices in the dynamic dict.
    example["src_map"] = src_ex_vocab(src)
    example["src_ex_vocab"] = src_ex_vocab

    if example["tgt"] is not None:
        data_task = vocabs["data_task"]
        if data_task == ModelTask.SEQ2SEQ:
            tgt = (
                [DefaultTokens.UNK]
                + example["tgt"]["tgt"].split()
                + [DefaultTokens.UNK]
            )
        elif data_task == ModelTask.LANGUAGE_MODEL:
            tgt = example["tgt"]["tgt"].split() + [DefaultTokens.UNK]
        else:
            # Previously `tgt` stayed unbound here and the next line failed
            # with UnboundLocalError; fail explicitly instead.
            raise ValueError(f"Something went wrong with task {data_task}")
        example["alignment"] = src_ex_vocab(tgt)
    return example