from transformers import BertTokenizer

# Loaded once at import time and shared by everything below.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Parameters preparation.
MAX_SENT_LENGTH = 128
PAD_TOKEN_ID = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)


def normalize_v2(text: str, entity: str) -> str:
    """Lowercase *text* and replace every occurrence of *entity* with the mask token.

    Both arguments are lowercased first, so matching is case-insensitive.
    Matching is a plain substring search: an entity that appears inside a
    longer word is masked as well.

    Args:
        text: The sentence to normalize.
        entity: The entity mention to hide behind ``tokenizer.mask_token``.

    Returns:
        The lowercased text with each occurrence of the lowercased entity
        replaced by ``tokenizer.mask_token``. If the entity is empty or does
        not occur in the text, the lowercased text is returned unchanged.
    """
    # Lowercase before substituting so the inserted mask token keeps its
    # original casing.
    text = text.lower()
    entity = entity.lower()

    # An empty entity is a substring of every string; str.replace("", x)
    # would insert x between every character, so leave the text untouched.
    if not entity:
        return text

    # str.replace is already a no-op when the entity is absent, so no
    # separate membership check is needed.
    # TODO: not sure if this will be decoded by BERT.
    return text.replace(entity, tokenizer.mask_token)