from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Parameter preparation.
MAX_SENT_LENGTH = 128
PAD_TOKEN_ID = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)

def normalize_v2(text, entity):
    """Lowercase the text and replace every mention of the entity with the mask token."""
    text = text.lower()
    entity = entity.lower()
    if entity not in text:
        return text
    text = text.replace(entity, tokenizer.mask_token)  # TODO: not sure if this will be decoded by BERT.
    return text
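

# A minimal usage sketch (not part of the original file; the example sentence
# and entity below are made up for illustration). Regarding the TODO above:
# BertTokenizer splits out its own special tokens before basic tokenization,
# so the inserted '[MASK]' string survives as a single special token.
if __name__ == '__main__':
    masked = normalize_v2('Barack Obama was born in Hawaii.', 'Barack Obama')
    print(masked)  # -> '[MASK] was born in hawaii.'
    token_ids = tokenizer.encode(masked, max_length=MAX_SENT_LENGTH,
                                 truncation=True, padding='max_length')
    # The sequence starts with [CLS] (101) followed by the mask token,
    # which maps to a single id (103 for bert-base-uncased).
    print(token_ids[:8])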