from datasets import load_dataset
from transformers import AutoTokenizer

def get_training_corpus(dataset):
    """Yield the original Japanese sentences from the dataset."""
    return (element["original_ja"] for element in dataset)
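
# Optional variant (a sketch, not part of the original script):
# train_new_from_iterator also accepts an iterator over batches of texts,
# which is typically faster than feeding one sentence at a time. The batch
# size of 1000 is an arbitrary choice.
def get_training_corpus_batched(dataset, batch_size=1000):
    batch = []
    for element in dataset:
        batch.append(element["original_ja"])
        if len(batch) == batch_size:
            yield batch  # a list of raw sentences
            batch = []
    if batch:
        yield batch  # flush the final partial batch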

# Stream the corpus so it is downloaded lazily instead of held in memory
dataset = load_dataset("snow_simplified_japanese_corpus", streaming=True, split="train")

# Hold out the first 100 examples for validation and train on the remainder
train_dataset = dataset.skip(100)
val_dataset = dataset.take(100)

# Start from an existing fast (Rust-backed) tokenizer and reuse its algorithm
# and special tokens; train_new_from_iterator only works with fast tokenizers
old_tokenizer = AutoTokenizer.from_pretrained("csebuetnlp/mT5_multilingual_XLSum")

print("Old Tokenizer:", old_tokenizer.tokenize("誰が一番に着くか私には分かりません。"))

# Train a new tokenizer with a 52,000-token vocabulary on the Japanese corpus
new_tokenizer = old_tokenizer.train_new_from_iterator(get_training_corpus(train_dataset), vocab_size=52000)

print("New Tokenizer:", new_tokenizer.tokenize("誰が一番に着くか私には分かりません。"))

# Save the trained tokenizer to a local directory so it can be reloaded later
new_tokenizer.save_pretrained("japanese-dummy-tokenizer")
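
# Quick sanity check (a sketch, not part of the original script): the saved
# directory can be reloaded with AutoTokenizer like any pretrained checkpoint
reloaded = AutoTokenizer.from_pretrained("japanese-dummy-tokenizer")
print("Reloaded Tokenizer:", reloaded.tokenize("誰が一番に着くか私には分かりません。"))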