File size: 471 Bytes
154ca7b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
from transformers import BertTokenizer

# Shared WordPiece tokenizer used by the helpers below.
# NOTE: from_pretrained() downloads/caches the 'bert-base-uncased' vocab on
# first use — module import has a network/disk side effect.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Parameters preparation.
# Maximum token length per sentence; not referenced in this chunk —
# presumably consumed by encoding code elsewhere in the file.
MAX_SENT_LENGTH = 128
# Integer vocab id of the tokenizer's padding token
# ('[PAD]' — id 0 for bert-base-uncased; TODO confirm against the loaded vocab).
PAD_TOKEN_ID = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)

def normalize_v2(text, entity, mask_token=None):
    """Lowercase *text* and mask out every occurrence of *entity*.

    Args:
        text: Input sentence (str).
        entity: Entity surface form to mask, matched case-insensitively.
        mask_token: Replacement string. Defaults to the module-level BERT
            tokenizer's mask token (typically '[MASK]').

    Returns:
        The lowercased text with all occurrences of the lowercased entity
        replaced by ``mask_token``. Returns the lowercased text unchanged
        when ``entity`` is empty or absent.
    """
    text = text.lower()
    entity = entity.lower()
    # Guard: str.replace('', mask) would insert the mask between every
    # character of the text, which is never the intent.
    if not entity:
        return text
    if mask_token is None:
        mask_token = tokenizer.mask_token
    # No containment check needed: str.replace is a no-op when the
    # substring is absent, so a separate `in` test only adds a second scan.
    # NOTE(review): '[MASK]' is a special token and should survive BERT
    # tokenization round-trips — TODO confirm for the tokenizer in use.
    return text.replace(entity, mask_token)