# Copyright (c) Facebook, Inc. and its affiliates. | |
# | |
# This source code is licensed under the MIT license found in the | |
# LICENSE file in the root directory of this source tree. | |
from fairseq.data.encoders import register_tokenizer | |
from fairseq.dataclass import FairseqDataclass | |
class NLTKTokenizer(object): | |
def __init__(self, *unused): | |
try: | |
from nltk.tokenize import word_tokenize | |
self.word_tokenize = word_tokenize | |
except ImportError: | |
raise ImportError("Please install nltk with: pip install nltk") | |
def encode(self, x: str) -> str: | |
return " ".join(self.word_tokenize(x)) | |
def decode(self, x: str) -> str: | |
return x | |