import os
from pathlib import Path

import torch
from torchinfo import summary
from tokenizers import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
from transformers import (RobertaConfig, RobertaTokenizer, RobertaForMaskedLM,
                          DataCollatorForLanguageModeling, LineByLineTextDataset,
                          Trainer, TrainingArguments)
paths = [str(x) for x in Path(".").glob("**/el_*.txt")]
print(paths)
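# Optional sanity check (a small addition, assuming your corpus files follow the
# el_*.txt naming used in the glob above): fail early if no files were found.
assert len(paths) > 0, "No el_*.txt corpus files found in the working directory"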
# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()
# Customize training
tokenizer.train(files=paths, vocab_size=52_000, min_frequency=2,
                special_tokens=[
                    "<s>",
                    "<pad>",
                    "</s>",
                    "<unk>",
                    "<mask>",
                ])
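# Optional sanity check of the freshly trained tokenizer. The sample sentence is
# only an illustrative assumption; any Spanish text from the corpus would do.
encoding = tokenizer.encode("En un lugar de la Mancha, de cuyo nombre no quiero acordarme")
print(encoding.tokens)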
# Save the trained tokenizer files (vocab.json + merges.txt) to ./QuijoBERT
dir_path = os.getcwd()
token_dir = os.path.join(dir_path, 'QuijoBERT')
if not os.path.exists(token_dir):
    os.makedirs(token_dir)
tokenizer.save_model('QuijoBERT')
# Reload the tokenizer and add RoBERTa-style post-processing:
# </s> as the separator token and <s> as the start-of-sequence token
tokenizer = ByteLevelBPETokenizer(
    "./QuijoBERT/vocab.json",
    "./QuijoBERT/merges.txt",
)
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)
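# Optional check that the post-processor now wraps every sequence in <s> ... </s>.
# The sample text below is an illustrative assumption.
enc = tokenizer.encode("En un lugar de la Mancha")
print(enc.tokens[0], enc.tokens[-1])  # expected: <s> </s>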
# RoBERTa-style configuration: 514 position embeddings = 512-token sequences
# plus the 2 positions RoBERTa reserves for its padding offset
config = RobertaConfig(
    vocab_size=52_000,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)
"""# Step 8: Re-creating the Tokenizer in Transformers"""
tokenizer = RobertaTokenizer.from_pretrained("./QuijoBERT", max_length=512)
# Initializing a model from the configuration (random weights)
model = RobertaForMaskedLM(config=config)
# In case we need to resume from a saved checkpoint after a crash:
# model = RobertaForMaskedLM.from_pretrained("./QuijoBERT/checkpoint-xxxxx")
# Print the model architecture
print(model)
# Print a layer-by-layer parameter summary with torchinfo
summary(model)
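# Rough size check: num_parameters() is the standard transformers helper for
# counting the weights of this 6-layer configuration.
print(f"{model.num_parameters():,} parameters")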
# Build the dataset: each line of el_quijote.txt becomes one training example,
# tokenized and truncated to block_size tokens
dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="./el_quijote.txt",
    block_size=128,
)
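# Optional: confirm how many line-level examples were produced from the corpus.
print(f"{len(dataset)} training examples")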
# Defining a data collator that randomly masks 15% of the tokens for MLM
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)
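# Illustrative peek at what the collator produces: a padded batch in which roughly
# 15% of the input tokens are masked and the corresponding labels are set for MLM.
# The two-example batch here is just for demonstration.
sample_batch = data_collator([dataset[i] for i in range(2)])
print(sample_batch["input_ids"].shape, sample_batch["labels"].shape)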
# Initializing the training arguments and the Trainer object
training_args = TrainingArguments(
    output_dir="./QuijoBERT",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=64,
    save_steps=1000,
    save_total_limit=2,
)
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)
# Training the model
print("Starting training")
trainer.train()
# Saving the final model (weights + config) to disk; the tokenizer files were
# already written to ./QuijoBERT earlier
trainer.save_model("./QuijoBERT")
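# Quick qualitative test of the trained model with a fill-mask pipeline.
# The prompt below is only an illustrative assumption; after a single epoch on a
# single book the predictions will be rough.
from transformers import pipeline

fill_mask = pipeline("fill-mask", model="./QuijoBERT", tokenizer="./QuijoBERT")
print(fill_mask("En un lugar de la <mask>."))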