import time

import torch
from transformers import (
    BertConfig,
    BertForMaskedLM,
    BertTokenizerFast,
    DataCollatorForLanguageModeling,
    LineByLineTextDataset,
    Trainer,
    TrainingArguments,
)

# Confirm that a CUDA-capable GPU is visible to PyTorch before training.
print(torch.cuda.is_available())

# Record the wall-clock start time so the total training time can be reported.
start_time = time.time()

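# A BERT-base-sized architecture (12 hidden layers, 12 attention heads), with
# the vocabulary size set to match the custom 25,000-token Latin WordPiece
# tokenizer.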
config = BertConfig(
    vocab_size=25000,
    max_position_embeddings=512,
    num_attention_heads=12,
    num_hidden_layers=12,
    type_vocab_size=2,
)

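# Load the custom Latin WordPiece tokenizer saved earlier in the
# "latin_WP_tokenizer" directory.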
tokenizer = BertTokenizerFast.from_pretrained(
    "latin_WP_tokenizer",
)

# Initialise a fresh BERT masked-language model from the config
# (random weights, no pretrained checkpoint).
model = BertForMaskedLM(config=config)

print(f"There are {model.num_parameters()} parameters")

# Plain-text training corpus; LineByLineTextDataset treats each non-empty line
# as one training example, tokenised and truncated to block_size tokens.
full_corpus_file = "03_full_latin_corpus_for_training.txt"

dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path=full_corpus_file,
    block_size=128,
)

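# Dynamically mask 15% of the input tokens in each batch, the standard BERT
# masked-language-modelling objective.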
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

output_dir = "./Latin_BERT_training_2"

training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=64,
    save_steps=10000,        # write a checkpoint every 10,000 steps
    save_total_limit=2,      # keep only the two most recent checkpoints
    prediction_loss_only=True,
)

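# The Trainer wires together the model, the masking collator, and the dataset,
# and handles batching, optimisation, and checkpointing.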
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

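# Run one full pass over the corpus (num_train_epochs=1).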
trainer.train()

# Save the final model weights and config for later fine-tuning or inference.
trainer.save_model("./latin_BERT_2")

end_time = time.time()
elapsed_time = end_time - start_time
print(f"Elapsed time: {elapsed_time} seconds")