"""Module for tokenization utilities."""

import logging

from termcolor import colored


def check_dataset_labels(dataset, tokenizer):
    """Log a colorized token/label view for the first few dataset examples."""
    # Guard against datasets with fewer than five examples.
    for idx in range(min(5, len(dataset))):
        check_example_labels(dataset[idx], tokenizer)


def check_example_labels(example, tokenizer):
    """Log one example as colorized tokens annotated with (label, mask, id)."""
    input_ids = example["input_ids"]
    labels = example["labels"]
    attention_mask = example["attention_mask"]

    colored_tokens = []
    for input_id, label_id, mask in zip(input_ids, labels, attention_mask):
        decoded_input_token = tokenizer.decode(input_id)
        # red: ignored by the loss (-100); yellow: label 0; green: everything else
        color = "red" if label_id == -100 else ("yellow" if label_id == 0 else "green")
        colored_token = colored(decoded_input_token, color) + colored(
            f"({label_id}, {mask}, {input_id})", "white"
        )
        colored_tokens.append(colored_token)

    logging.info(" ".join(colored_tokens))
    logging.info("\n\n\n")
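

if __name__ == "__main__":
    # Minimal usage sketch, not part of the module proper: builds one tiny
    # tokenized example by hand so the colorized output can be inspected
    # without any dataset on disk. Assumes a Hugging Face tokenizer; the
    # "gpt2" checkpoint and the hand-rolled labels (with -100 marking tokens
    # the loss should ignore) are hypothetical stand-ins for illustration.
    from transformers import AutoTokenizer

    logging.basicConfig(level=logging.INFO)
    tokenizer = AutoTokenizer.from_pretrained("gpt2")

    encoded = tokenizer("Hello world")
    example = {
        "input_ids": encoded["input_ids"],
        # Mask the first token as if it were a prompt token excluded from loss.
        "labels": [-100] + encoded["input_ids"][1:],
        "attention_mask": encoded["attention_mask"],
    }
    check_example_labels(example, tokenizer)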
|