from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
class Tokenizer:
    def __init__(
        self,
        lowercase: bool = True,
        multiword_expressions: list[str] = None,
        stemming: bool = False,
    ) -> None:
        """
        A generic class for objects that turn strings into sequences of tokens.
        A tokenizer can support different preprocessing options or use different methods
        for determining word breaks.

        Args:
            lowercase: Whether to lowercase all the tokens
            multiword_expressions: A list of strings that should be recognized as single tokens.
                If set to 'None', no multi-word expression matching is performed.
                No need to perform/implement multi-word expression recognition for HW3.
            stemming: Whether to apply Porter stemming to the tokens
        """
        # Save arguments that are needed as fields of this class
        self.lowercase = lowercase
        self.multiword_expressions = multiword_expressions
        self.stemming = stemming

    def find_and_replace_mwes(self, input_tokens: list[str]) -> list[str]:
        """
        IGNORE THIS PART; NO NEED TO IMPLEMENT THIS SINCE NO MULTI-WORD EXPRESSION PROCESSING IS TO BE USED.
        For the given sequence of tokens, finds any recognized multi-word expressions in the sequence
        and replaces that subsequence with a single token containing the multi-word expression.

        Args:
            input_tokens: A list of tokens

        Returns:
            A list of tokens containing processed multi-word expressions
        """
        # NOTE: You shouldn't implement this in the homework
        raise NotImplementedError("MWE is not supported")

    def postprocess(self, input_tokens: list[str]) -> list[str]:
        """
        Performs any optional operations that modify the tokenized list of words, such as
        lower-casing and stemming, and returns the modified list of tokens.

        Args:
            input_tokens: A list of tokens

        Returns:
            A list of tokens, lower-cased and/or stemmed depending on the configured options
        """
        # Lower-case every token if requested
        if self.lowercase:
            input_tokens = [token.lower() for token in input_tokens]
        # Apply Porter stemming if requested
        if self.stemming:
            ps = PorterStemmer()
            input_tokens = [ps.stem(token) for token in input_tokens]
        return input_tokens
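
    # Example (illustrative, not part of the assignment): with lowercase=True and
    # stemming=True, postprocess(["Running", "Dogs"]) returns ["run", "dog"].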

    def tokenize(self, text: str) -> list[str]:
        """
        Splits a string into a list of tokens and performs all required postprocessing steps.

        Args:
            text: An input text you want to tokenize

        Returns:
            A list of tokens
        """
        raise NotImplementedError(
            "tokenize() is not implemented in the base class; please use a subclass"
        )


class RegexTokenizer(Tokenizer):
    def __init__(
        self,
        token_regex: str = "\\w+",
        lowercase: bool = True,
        multiword_expressions: list[str] = None,
        stemming: bool = False,
    ) -> None:
        """
        Uses NLTK's RegexpTokenizer to tokenize a given string.

        Args:
            token_regex: Regular expression pattern used to match tokens; defaults to '\\w+'
            lowercase: Whether to lowercase all the tokens
            multiword_expressions: A list of strings that should be recognized as single tokens.
                If set to 'None', no multi-word expression matching is performed.
                No need to perform/implement multi-word expression recognition for HW3; you can ignore this.
            stemming: Whether to apply Porter stemming to the tokens
        """
        super().__init__(lowercase, multiword_expressions, stemming)
        # Save the regex pattern and initialize NLTK's RegexpTokenizer with it
        self.token_regex = token_regex
        self.tokenizer = RegexpTokenizer(token_regex)

    def tokenize(self, text: str) -> list[str]:
        """Uses NLTK's RegexpTokenizer and a regular expression pattern to tokenize a string.

        Args:
            text: An input text you want to tokenize

        Returns:
            A list of tokens
        """
        # Tokenize the given text, then apply the optional postprocessing steps
        words = self.tokenizer.tokenize(text)
        return self.postprocess(words)
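

# A minimal usage sketch (not part of the original skeleton), assuming NLTK and its
# Porter stemmer are available; it simply exercises RegexTokenizer end to end.
if __name__ == "__main__":
    tokenizer = RegexTokenizer("\\w+", lowercase=True, stemming=True)
    # With lower-casing and Porter stemming enabled, the sentence below
    # tokenizes to ['the', 'dog', 'are', 'run', 'quickli'].
    print(tokenizer.tokenize("The Dogs are running quickly"))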