# **Google Drive mounting**

In [None]:
# Mount Google Drive at /content/drive so the cells below can read from and
# write to paths under '/content/drive/My Drive/'.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **Data Processing**

In [None]:
import os
from bs4 import BeautifulSoup

data_folder = '/content/drive/My Drive/Data/HUISGENOOT/'

# Zero-based slice bounds that select documents 1001 to 1800 (inclusive,
# counting from 1) of the sorted .html listing.
first_doc = 1000
last_doc = 1800

# os.listdir() returns names in arbitrary order, so the listing is sorted to
# make the "1001 to 1800" window pick the same files on every run.
# NOTE(review): if documents 1-1000 were extracted elsewhere from an unsorted
# listing, the two windows may overlap - confirm and re-run that step if so.
html_files = sorted(
    name for name in os.listdir(data_folder) if name.endswith(".html")
)

# Count of .html files considered, capped at last_doc. This is the same final
# value the previous counting loop left in number_files.
number_files = min(len(html_files), last_doc)

# Collect the per-document texts and join once; repeated `corpus += text`
# copies the whole corpus on every iteration.
texts = []
for filename in html_files[first_doc:last_doc]:
    with open(os.path.join(data_folder, filename), 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file.read(), 'html.parser')
    # One trailing newline per document, as before.
    texts.append(soup.get_text() + "\n")
corpus = "".join(texts)

# Save the processed text to a file
with open('/content/drive/My Drive/Data/HUISGENOOT_corpus_1001_1800.txt', 'w', encoding='utf-8') as file:
    file.write(corpus)

# Data Cleaning

In [None]:
import re

def clean_text(text):
    """Reduce raw text to lowercase words separated by single spaces.

    Letters of any script are kept, including accented letters such as
    ê, ë, ô and ï. Digits, punctuation and other symbols are removed, and
    every run of whitespace (newlines and non-breaking spaces included)
    collapses to one space.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned, lowercased string with no leading or trailing whitespace.
    """
    import unicodedata  # function-scope import keeps this cell self-contained

    # Compose decomposed sequences (e.g. "e" + combining circumflex) into a
    # single code point so the accent is not dropped by the letter filter.
    text = unicodedata.normalize('NFC', text)

    # Turn newlines and non-breaking spaces into ordinary spaces
    text = text.replace('\n', ' ').replace('\xa0', ' ')

    # Keep only letters and whitespace. str.isalpha() is Unicode-aware, unlike
    # the ASCII-only class [a-zA-Z], which deleted accented letters.
    text = ''.join(ch for ch in text if ch.isalpha() or ch.isspace())

    # Collapse whitespace runs, then trim: removing characters above can
    # expose new leading/trailing spaces, so the strip has to come last.
    text = re.sub(r'\s+', ' ', text).strip()

    return text.lower()  # Convert text to lowercase

# Clean the in-memory corpus built by the data-processing cell. The result is
# a single lowercase string (clean_text ends with .lower()).
cleaned_corpus = clean_text(corpus)

# Data Tokenization

In [None]:
import nltk
nltk.download('punkt')
# NLTK 3.9 and later load the 'punkt_tab' resource from word_tokenize and raise
# LookupError if only 'punkt' is present; fetching both works on old and new
# NLTK releases.
nltk.download('punkt_tab')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
import nltk
# Add a Drive folder to the locations NLTK searches for resources. The
# previous string literal began with a stray single quote, so the path could
# never match a real directory.
# NOTE(review): the folder name "Stopw" looks truncated - confirm the actual
# directory name on Drive.
nltk.data.path.append("/content/drive/My Drive/Data/Stopw")

In [None]:
# Build the stop-word set. NOTE(review): this loads NLTK's *Dutch* list even
# though the variable is named for Afrikaans - presumably a stand-in for the
# closely related language; confirm this is intended.
afrikaans_stop_words = set(stopwords.words('dutch'))

# Split the cleaned corpus into word tokens
tokens = word_tokenize(cleaned_corpus)

# Drop every token whose lowercase form is in the stop-word set
filtered_tokens = [word for word in tokens if word.lower() not in afrikaans_stop_words]

# Join the surviving tokens back into one space-separated string
filtered_text = ' '.join(filtered_tokens)

# Write the stop-word-filtered text to Drive
with open('/content/drive/My Drive/Data/HUISGENOOT_corpus_preprocessed_V3.txt', 'w', encoding='utf-8') as file:
    file.write(filtered_text)


# Model Training

In [None]:
!pip install transformers
# The requirement must be quoted: unquoted, the shell parses ">" as output
# redirection, so pip installs an unconstrained "accelerate" and writes its
# log to a file named "=0.21.0".
!pip install "accelerate>=0.21.0"

from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

# Load the pretrained "gpt2" tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Build the training dataset from the corpus text file.
# NOTE(review): train_path is the raw extracted corpus written by the
# data-processing cell, not the cleaned/stop-word-filtered
# HUISGENOOT_corpus_preprocessed_V3.txt file - confirm this is intended.
train_path = '/content/drive/My Drive/Data/HUISGENOOT_corpus_1001_1800.txt'
train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=train_path,
    block_size=128
)

# Create data collator; mlm=False turns off masked-language-model batching
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

# Start from the pretrained "gpt2" weights
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Define the training arguments. Checkpoints go to ./gpt2-afrikaans, a path
# relative to the working directory rather than under /content/drive.
training_args = TrainingArguments(
    output_dir="./gpt2-afrikaans",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
)

# Create Trainer instance (no eval_dataset is supplied)
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

# Start training; as the last expression, the returned value is displayed as
# the cell output
trainer.train()




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Step,Training Loss
500,4.2638
1000,3.6456
1500,3.4589
2000,3.2708
2500,3.1589
3000,3.0653
3500,2.9625
4000,2.9375
4500,2.857
5000,2.8093


TrainOutput(global_step=8427, training_loss=3.0249779586819154, metrics={'train_runtime': 1550.8083, 'train_samples_per_second': 21.736, 'train_steps_per_second': 5.434, 'total_flos': 2201907953664000.0, 'train_loss': 3.0249779586819154, 'epoch': 3.0})

# Saving

In [None]:
# Save the trained model. output_dir is the same relative path that was passed
# to TrainingArguments as output_dir.
# NOTE(review): this path is not under /content/drive; presumably it is lost
# when the Colab runtime is recycled - confirm whether it should be on Drive.
output_dir = "./gpt2-afrikaans"
trainer.save_model(output_dir)

# Save the tokenizer into the same directory as the model; as the last
# expression, the returned value is displayed as the cell output
tokenizer.save_pretrained(output_dir)

('./gpt2-afrikaans/tokenizer_config.json',
 './gpt2-afrikaans/special_tokens_map.json',
 './gpt2-afrikaans/vocab.json',
 './gpt2-afrikaans/merges.txt',
 './gpt2-afrikaans/added_tokens.json')