The main code below is based on [akpe12/JP-KR-ocr-translator-for-travel](https://github.com/akpe12/JP-KR-ocr-translator-for-travel).

## Import

In [None]:

from typing import Dict, List
import csv
import torch
from transformers import (
    EncoderDecoderModel,
    GPT2Tokenizer as BaseGPT2Tokenizer,
    PreTrainedTokenizer, BertTokenizerFast,
    PreTrainedTokenizerFast,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    AutoTokenizer,
    XLMRobertaTokenizerFast,
    BertJapaneseTokenizer,
    Trainer
)
from torch.utils.data import DataLoader
from transformers.models.encoder_decoder.modeling_encoder_decoder import EncoderDecoderModel

# Checkpoint names later handed to EncoderDecoderModel.from_encoder_decoder_pretrained
# and to the matching tokenizers.
# encoder_model_name = "xlm-roberta-base"
encoder_model_name = "cl-tohoku/bert-base-japanese-v2"  # encoder (source side) checkpoint
decoder_model_name = "skt/kogpt2-base-v2"  # decoder (target side) checkpoint

In [None]:
# Use the first CUDA GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
# device = torch.device("cpu")
# Last expression of the cell: shows the chosen device and the visible GPU count.
device, torch.cuda.device_count()

In [None]:
class GPT2Tokenizer(PreTrainedTokenizerFast):
    """Fast tokenizer whose special-token hook appends a single EOS id."""

    def build_inputs_with_special_tokens(self, token_ids: List[int]) -> List[int]:
        """Return ``token_ids`` followed by this tokenizer's ``eos_token_id``.

        A new list is built; the list passed in is not modified.
        """
        eos_suffix = [self.eos_token_id]
        return token_ids + eos_suffix

# Source-side tokenizer, loaded from the encoder checkpoint.
src_tokenizer = BertJapaneseTokenizer.from_pretrained(encoder_model_name)
# Target-side tokenizer, loaded from the decoder checkpoint through the GPT2Tokenizer
# subclass defined above; '</s>' is registered as both the BOS and the EOS token.
trg_tokenizer = GPT2Tokenizer.from_pretrained(decoder_model_name, bos_token='</s>', eos_token='</s>', unk_token='<unk>',
  pad_token='<pad>', mask_token='<mask>')

## Data

In [None]:
class PairedDataset:
    """Map-style dataset of (source, target) sentence pairs read from a CSV file.

    The first CSV row is treated as a header and discarded. Every following
    non-empty row is expected to hold exactly two columns: the source sentence
    followed by the target sentence.
    """

    def __init__(self,
        src_tokenizer: PreTrainedTokenizerFast, tgt_tokenizer: PreTrainedTokenizerFast,
        file_path: str
    ):
        """Load every sentence pair from ``file_path`` into memory.

        Args:
            src_tokenizer: Callable tokenizer applied to the source column.
            tgt_tokenizer: Callable tokenizer applied to the target column; it
                must also provide ``build_inputs_with_special_tokens``.
            file_path: Path to a UTF-8 encoded CSV file with a header row.
        """
        self.src_tokenizer = src_tokenizer
        self.trg_tokenizer = tgt_tokenizer
        # Decode explicitly as UTF-8 so the text is read the same way on every
        # platform (the locale default is not UTF-8 everywhere), and pass
        # newline='' as the csv module requires for quoted multi-line fields.
        with open(file_path, 'r', encoding='utf-8', newline='') as fd:
            reader = csv.reader(fd)
            # Skip the header row; the default keeps an empty file from
            # raising StopIteration.
            next(reader, None)
            # csv.reader yields [] for blank lines; drop them so they do not
            # become unusable samples.
            self.data = [row for row in reader if row]

    def __getitem__(self, index: int) -> Dict[str, torch.Tensor]:
        """Tokenize and return the pair stored at ``index``.

        Returns:
            The source tokenizer's output for the source sentence, with an
            added ``'labels'`` entry holding the target token ids after
            ``build_inputs_with_special_tokens`` has been applied.
            NOTE(review): no ``return_tensors`` is requested here, so the
            values are presumably plain id lists rather than tensors - confirm.

        Raises:
            ValueError: If the stored row does not have exactly two columns.
        """
        try:
            src, trg = self.data[index]
        except ValueError as err:
            # Re-raise with the offending row so malformed CSV lines are easy to locate.
            raise ValueError(
                f'data[{index}] must have exactly 2 columns (source, target), '
                f'got {self.data[index]!r}'
            ) from err
        embeddings = self.src_tokenizer(src, return_attention_mask=False, return_token_type_ids=False)
        target_ids = self.trg_tokenizer(trg, return_attention_mask=False)['input_ids']
        embeddings['labels'] = self.trg_tokenizer.build_inputs_with_special_tokens(target_ids)

        return embeddings

    def __len__(self):
        """Return the number of sentence pairs loaded from the file."""
        return len(self.data)
    
# Directory holding the CSV files, and the file names available inside it.
DATA_ROOT = './output'
FILE_FFAC_FULL = 'ffac_full.csv'
FILE_FFAC_TEST = 'ffac_test.csv'
FILE_JA_KO_TRAIN = 'ja_ko_train.csv'
FILE_JA_KO_TEST = 'ja_ko_test.csv'

# Alternative: train/evaluate on the FFAC files only.
# train_dataset = PairedDataset(src_tokenizer, trg_tokenizer, f'{DATA_ROOT}/{FILE_FFAC_FULL}')
# eval_dataset = PairedDataset(src_tokenizer, trg_tokenizer, f'{DATA_ROOT}/{FILE_FFAC_TEST}')
# Active configuration: the ja_ko train/test files.
train_dataset = PairedDataset(src_tokenizer, trg_tokenizer, f'{DATA_ROOT}/{FILE_JA_KO_TRAIN}')
eval_dataset = PairedDataset(src_tokenizer, trg_tokenizer, f'{DATA_ROOT}/{FILE_JA_KO_TEST}')

In [None]:
# If you encounter "ValueError: too many values to unpack (expected 2)" at
# `src, trg = self.data[index]`, check the column count of every row in the dataset.
# A likely cause is building the file with `cat ffac_full.csv tteb_train.csv > ja_ko_train.csv`:
# if the first CSV does not end with a newline, its last row and the first row of the second CSV
# are merged into a single line, which creates a third column (and that raises the ValueError).
# debug_data = train_dataset.data


## Model

In [None]:
# Assemble the encoder-decoder model from the two pretrained checkpoints named above.
model = EncoderDecoderModel.from_encoder_decoder_pretrained(
    encoder_model_name,
    decoder_model_name,
    # NOTE(review): the target tokenizer's BOS id is passed as the pad id here,
    # not the id of its '<pad>' token - confirm this is intended.
    pad_token_id=trg_tokenizer.bos_token_id,
)
# Generation starts from the target tokenizer's BOS token.
model.config.decoder_start_token_id = trg_tokenizer.bos_token_id

In [None]:
# for Trainer
import wandb

# Batch collator built from the source tokenizer and the model.
collate_fn = DataCollatorForSeq2Seq(src_tokenizer, model)
# Start the Weights & Biases run that the Trainer reports to (see report_to below).
wandb.init(project="fftr-poc1", name='jbert+kogpt2')

arguments = Seq2SeqTrainingArguments(
    output_dir='dump',
    do_train=True,
    do_eval=True,
    # Evaluate and checkpoint on the same schedule; load_best_model_at_end
    # below relies on the two strategies matching.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=3,
    # num_train_epochs=25,
    per_device_train_batch_size=30,
    # per_device_train_batch_size=64,
    per_device_eval_batch_size=30,
    # per_device_eval_batch_size=64,
    warmup_ratio=0.1,
    # 30 samples x 4 accumulation steps = 120 samples per device per optimizer step.
    gradient_accumulation_steps=4,
    save_total_limit=5,
    dataloader_num_workers=1,
    # Mixed precision needs a CUDA device. This notebook explicitly supports a
    # CPU fallback, so enable fp16 only when a GPU is actually available
    # instead of failing while the arguments are being built.
    fp16=torch.cuda.is_available(),
    load_best_model_at_end=True,
    report_to='wandb'
)

trainer = Trainer(
    model,
    arguments,
    data_collator=collate_fn,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset
)

## Training

In [None]:
# model = EncoderDecoderModel.from_encoder_decoder_pretrained("xlm-roberta-base",  "skt/kogpt2-base-v2")

In [None]:
# Run the training loop configured above.
trainer.train()

# Persist the model weights and both tokenizers under dump/best_model.
# NOTE(review): the training arguments set load_best_model_at_end=True, so `model`
# presumably holds the best checkpoint at this point - confirm.
model.save_pretrained("dump/best_model")
src_tokenizer.save_pretrained("dump/best_model/src_tokenizer")
trg_tokenizer.save_pretrained("dump/best_model/trg_tokenizer")