diff --git "a/a2_Q9_LSTM_Japanes_.ipynb" "b/a2_Q9_LSTM_Japanes_.ipynb" new file mode 100644--- /dev/null +++ "b/a2_Q9_LSTM_Japanes_.ipynb" @@ -0,0 +1,708 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "gpuType": "T4" + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + }, + "accelerator": "GPU" + }, + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "g_3ZxaoEtJnl", + "outputId": "0445ad26-8249-412d-cd0e-ec5e5c1d82c6" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Collecting sacrebleu\n", + " Downloading sacrebleu-2.4.3-py3-none-any.whl.metadata (51 kB)\n", + "\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/51.8 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m51.8/51.8 kB\u001b[0m \u001b[31m4.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting portalocker (from sacrebleu)\n", + " Downloading portalocker-2.10.1-py3-none-any.whl.metadata (8.5 kB)\n", + "Requirement already satisfied: regex in /usr/local/lib/python3.10/dist-packages (from sacrebleu) (2024.9.11)\n", + "Requirement already satisfied: tabulate>=0.8.9 in /usr/local/lib/python3.10/dist-packages (from sacrebleu) (0.9.0)\n", + "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from sacrebleu) (1.26.4)\n", + "Collecting colorama (from sacrebleu)\n", + " Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)\n", + "Requirement already satisfied: lxml in /usr/local/lib/python3.10/dist-packages (from sacrebleu) (5.3.0)\n", + "Downloading sacrebleu-2.4.3-py3-none-any.whl (103 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m 
# %% Setup: dependencies and imports
# Shell install step of the original cell, kept in commented (jupytext-style)
# form so that this cell is plain Python. Uncomment / run it in Colab when
# sacrebleu is not already installed.
# !pip install sacrebleu
import csv
import json
import re

import matplotlib.pyplot as plt
import numpy as np
import sacrebleu
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset


# %% Load the English/Japanese sentence pairs
DATA_PATH = '/content/Japanes.json'
NUM_PAIRS = 1000  # only the first NUM_PAIRS records of the file are used

# A backslash-u escape followed by fewer than four hex digits. json.loads
# rejects such escapes, so they are stripped before parsing. Complete
# four-digit escapes never match because of the negative look-ahead.
_TRUNCATED_UNICODE_ESCAPE = re.compile(r'\\u[0-9A-Fa-f]{0,3}(?![0-9A-Fa-f])')


def clean_json(file_path):
    """Read a JSON file as text and drop truncated ``\\uXXXX`` escapes.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The file content as a string with every backslash-u escape that has
        fewer than four hex digits removed. Bytes that are not valid UTF-8
        are skipped while reading (``errors='ignore'``).

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
        content = f.read()
    return _TRUNCATED_UNICODE_ESCAPE.sub('', content)


file_path = DATA_PATH
try:
    data = json.loads(clean_json(file_path))
except json.JSONDecodeError as e:
    print(f"Error decoding JSON: {e}")
    # Stop here: continuing would leave `data` undefined and fail a few lines
    # further down with a NameError that hides the real cause.
    raise
print("JSON loaded successfully.")

# Each record is indexed with the keys 'input' (English side) and 'output'
# (Japanese side).
en_sentences = [entry['input'] for entry in data[:NUM_PAIRS]]
ja_sentences = [entry['output'] for entry in data[:NUM_PAIRS]]
# %% Hyperparameters, vocabulary and tensorization
MAX_LEN = 60
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
VOCAB_SIZE = 200000
EMBEDDING_DIM = 256
HIDDEN_DIM = 512
BATCH_SIZE = 32
NUM_EPOCHS = 20


def tokenizer(text):
    """Split ``text`` on whitespace and return the list of tokens.

    NOTE(review): the Japanese samples printed by this notebook contain almost
    no spaces, so on the target side a whitespace "token" is usually a whole
    clause or sentence. Consider a character- or morpheme-level tokenizer.
    """
    return text.split()


def build_vocab(sentences, first_index=4):
    """Map every distinct token to an integer id in first-seen order.

    Args:
        sentences: Iterable of raw sentences; each is split with ``tokenizer``.
        first_index: Id given to the first token (default 4, leaving ids 0-3
            free for the special tokens added by the caller).

    Returns:
        dict mapping token -> id, ids running from ``first_index`` upwards.
    """
    ordered_tokens = dict.fromkeys(
        token for sentence in sentences for token in tokenizer(sentence)
    )
    return {token: first_index + offset for offset, token in enumerate(ordered_tokens)}


# NOTE(review): in this file the four special-token keys below are identical
# empty strings, so the dict literal keeps a single '' entry with value 3.
# Presumably they were four distinct markers (padding / start / end / unknown,
# ids 0-3) whose angle-bracket names were stripped when the notebook was
# exported. Confirm against the original notebook before relying on these ids.
# The literals are left untouched because the training and translation cells
# look the same keys up.
en_vocab = {'': 0, '': 1, '': 2, '': 3, **build_vocab(en_sentences)}
ja_vocab = {'': 0, '': 1, '': 2, '': 3, **build_vocab(ja_sentences)}


def sentence_to_tensor(sentence, vocab, max_len=MAX_LEN):
    """Encode a sentence as a 1-D integer tensor of length ``max_len``.

    Layout: start id, token ids, end id, then padding ids up to ``max_len``.
    Tokens missing from ``vocab`` are replaced by the unknown-token id.

    Sentences with more than ``max_len - 2`` tokens are truncated *before* the
    end marker is appended, so the end marker is always present. The previous
    version sliced the finished sequence and silently cut the end marker off.

    Args:
        sentence: Raw sentence; split with ``tokenizer``.
        vocab: token -> id mapping containing the special-token keys.
        max_len: Length of the returned tensor.

    Returns:
        torch.LongTensor of shape ``(max_len,)``.
    """
    unknown_id = vocab['']
    token_ids = [vocab.get(word, unknown_id) for word in tokenizer(sentence)]
    token_ids = token_ids[:max(max_len - 2, 0)]  # leave room for start/end ids
    padding = [vocab['']] * (max_len - len(token_ids) - 2)
    ids = [vocab['']] + token_ids + [vocab['']] + padding
    # The trailing slice only matters for max_len < 2.
    return torch.tensor(ids, dtype=torch.long)[:max_len]


en_tensor_sentences = [sentence_to_tensor(s, en_vocab) for s in en_sentences]
ja_tensor_sentences = [sentence_to_tensor(s, ja_vocab) for s in ja_sentences]

# Split into training and validation sets (first 80 % / last 20 %, unshuffled)
train_size = int(len(en_tensor_sentences) * 0.8)
train_en, val_en = en_tensor_sentences[:train_size], en_tensor_sentences[train_size:]
train_ja, val_ja = ja_tensor_sentences[:train_size], ja_tensor_sentences[train_size:]


# %% Show a few sentence pairs
def print_sample_data(en_sentences, bu_sentences, num_samples=4):
    """Print the first sentence pairs in both translation directions.

    Args:
        en_sentences: English sentences.
        bu_sentences: Target-language (here Japanese) sentences, aligned by
            index with ``en_sentences``.
        num_samples: Maximum number of pairs to print (default 4). Fewer are
            printed when the inputs are shorter, instead of raising IndexError.
    """
    pairs = zip(en_sentences[:num_samples], bu_sentences[:num_samples])
    for en_sample, bu_sample in pairs:
        print(f"EN to Ja\n\n {en_sample} \n {bu_sample} \n")
        print(f"Ja to EN\n\n {bu_sample} \n {en_sample} \n\n")


# The Japanese list is named `ja_sentences`; `bu_sentences` is not defined
# anywhere in the notebook and raised NameError on a fresh kernel.
print_sample_data(en_sentences, ja_sentences)
バランスの取れた栄養価の高い食事を摂る: 食事にはさまざまな果物や野菜、脂肪分の少ないタンパク質、全粒穀物、健康的な脂肪が含まれていることを確認してください。これは体が最高の状態で機能するために必須の栄養素を提供するのに役立ち、慢性疾患の予防に役立ちます。 2. 定期的な身体活動に参加する: 強い骨、筋肉、心臓血管の健康を維持するには、運動が不可欠です。毎週少なくとも 150 分間の中程度の有酸素運動、または 75 分間の激しい運動を目標にしましょう。 3. 十分な睡眠をとる:質の高い睡眠を十分にとることは、身体的および精神的な健康にとって非常に重要です。気分を調整し、認知機能を改善し、健康な成長と免疫機能をサポートします。毎晩7〜9時間の睡眠を目指しましょう。 \n", + " 1. Eat a balanced and nutritious diet: Make sure your meals are inclusive of a variety of fruits and vegetables, lean protein, whole grains, and healthy fats. This helps to provide your body with the essential nutrients to function at its best and can help prevent chronic diseases.\n", + "\n", + "2. Engage in regular physical activity: Exercise is crucial for maintaining strong bones, muscles, and cardiovascular health. Aim for at least 150 minutes of moderate aerobic exercise or 75 minutes of vigorous exercise each week.\n", + "\n", + "3. Get enough sleep: Getting enough quality sleep is crucial for physical and mental well-being. It helps to regulate mood, improve cognitive function, and supports healthy growth and immune function. Aim for 7-9 hours of sleep each night. \n", + "\n", + "\n", + "EN to Ja\n", + "\n", + " What are the three primary colors? \n", + " 三原色とは何ですか? \n", + "\n", + "Ja to EN\n", + "\n", + " 三原色とは何ですか? \n", + " What are the three primary colors? \n", + "\n", + "\n", + "EN to Ja\n", + "\n", + " The three primary colors are red, blue, and yellow. These colors are called primary because they cannot be created by mixing other colors and all other colors can be made by combining them in various proportions. In the additive color system, used for light, the primary colors are red, green, and blue (RGB). 
# %% Dataset and data loaders
class TranslationDataset(Dataset):
    """Index-aligned pairs of source and target tensors.

    Args:
        en: Sequence of source-side (English) tensors.
        bu: Sequence of target-side tensors, same length as ``en``.

    Raises:
        ValueError: If the two sequences differ in length. Without the check a
            mismatch only showed up later as an IndexError inside the
            DataLoader, or as silently unused extra targets.
    """

    def __init__(self, en, bu):
        if len(en) != len(bu):
            raise ValueError(
                f"source/target size mismatch: {len(en)} vs {len(bu)}"
            )
        # `te` holds the target-side tensors; attribute name kept as-is.
        self.en, self.te = en, bu

    def __len__(self):
        return len(self.en)

    def __getitem__(self, idx):
        return self.en[idx], self.te[idx]


def collate_fn(batch):
    """Stack a list of (source, target) tensor pairs into two batch tensors.

    Returns:
        Tuple ``(sources, targets)``, each stacked along a new dimension 0.
    """
    sources, targets = zip(*batch)
    return torch.stack(sources, dim=0), torch.stack(targets, dim=0)


train_loader = DataLoader(
    TranslationDataset(train_en, train_ja),
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_fn,
)
val_loader = DataLoader(
    TranslationDataset(val_en, val_ja),
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=collate_fn,
)


# %% Model
import pandas as pd


class LSTMTranslationModel(nn.Module):
    """Embedding -> multi-layer LSTM -> linear projection, applied per position.

    The network emits one vector of output-vocabulary logits for every input
    position. There is no separate decoder: the prediction at position ``t``
    is computed from the LSTM output at position ``t`` of the *source*
    sequence.

    Args:
        input_dim: Number of rows of the embedding table.
        output_dim: Size of the output vocabulary (width of the logits).
        embed_dim: Embedding width.
        hidden_dim: LSTM hidden size.
        num_layers: Number of stacked LSTM layers (default 2).
    """

    def __init__(self, input_dim, output_dim, embed_dim, hidden_dim, num_layers=2):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers=num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x, te_input=None):
        """Return logits for every position of ``x``.

        Args:
            x: Integer token ids; the LSTM is built with ``batch_first=True``,
                so the batch dimension comes first.
            te_input: Accepted for call compatibility but NOT used. Passing a
                partial target sequence has no effect on the result.

        Returns:
            Tensor of logits whose last dimension is ``output_dim``.
        """
        embedded = self.embedding(x)
        lstm_out, _state = self.lstm(embedded)
        return self.fc(lstm_out)


# NOTE(review): the embedding table is sized by the VOCAB_SIZE constant, not by
# len(en_vocab); the summary recorded with this notebook lists 51,200,000
# embedding parameters as a result. Left unchanged because other cells may feed
# ids from a different vocabulary into the same model.
model = LSTMTranslationModel(
    input_dim=VOCAB_SIZE,
    output_dim=len(ja_vocab),
    embed_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
).to(DEVICE)


def custom_model_summary(model):
    """Print a markdown table with one row per named parameter of ``model``.

    Columns: parameter name, the Python type name of the parameter object, its
    size (printed under the heading "Output Shape", although it is the shape
    of the parameter tensor itself) and its element count.
    """
    rows = [
        {
            "Layer Name": name,
            "Layer Type": type(param).__name__,
            "Output Shape": list(param.size()),
            "Parameters": param.numel(),
        }
        for name, param in model.named_parameters()
    ]
    df_summary = pd.DataFrame(rows)
    print(df_summary.to_markdown(index=False))


custom_model_summary(model)
# %% Training
optimizer = optim.Adam(model.parameters(), lr=0.001)
# vocab[''] is the id sentence_to_tensor pads with; those positions are
# excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=ja_vocab[''])


def _mean_epoch_loss(model, loader, criterion, optimizer=None):
    """Run one pass over ``loader`` and return the mean per-batch loss.

    When ``optimizer`` is given, gradients are computed and a step is taken
    after every batch; otherwise the pass is evaluation-only. The caller is
    responsible for ``model.train()`` / ``model.eval()`` and ``no_grad``.
    Returns 0.0 for an empty loader instead of dividing by zero.
    """
    running_loss = 0.0
    for en_batch, ja_batch in loader:
        en_batch = en_batch.to(DEVICE)
        ja_batch = ja_batch.to(DEVICE).long()
        if optimizer is not None:
            optimizer.zero_grad()
        output = model(en_batch)
        # Flatten so every position is one classification example: logits of
        # position t are scored against the target id at position t.
        loss = criterion(output.reshape(-1, output.shape[-1]), ja_batch.reshape(-1))
        if optimizer is not None:
            loss.backward()
            optimizer.step()
        running_loss += loss.item()
    return running_loss / max(len(loader), 1)


def train_model(model, train_loader, val_loader, optimizer, criterion, epochs=10):
    """Train ``model`` and report the mean train/validation loss per epoch.

    Args:
        model: Network called as ``model(source_batch)``.
        train_loader: Loader yielding (source, target) batches for training.
        val_loader: Loader yielding (source, target) batches for validation.
        optimizer: Optimizer over ``model``'s parameters.
        criterion: Loss taking flattened logits and flattened target ids.
        epochs: Number of epochs to run (default 10).

    Returns:
        Tuple ``(train_losses, val_losses)``: two lists with one mean batch
        loss per epoch.
    """
    train_losses, val_losses = [], []
    for epoch in range(epochs):
        model.train()
        train_losses.append(_mean_epoch_loss(model, train_loader, criterion, optimizer))

        model.eval()
        with torch.no_grad():
            val_losses.append(_mean_epoch_loss(model, val_loader, criterion))
        print(f'Epoch {epoch+1}/{epochs}, Train Loss: {train_losses[-1]}, Val Loss: {val_losses[-1]}')

    return train_losses, val_losses


train_losses, val_losses = train_model(model, train_loader, val_loader, optimizer, criterion, epochs=NUM_EPOCHS)

# NOTE(review): in the run recorded with this notebook the validation loss
# rises from the first epoch on while the training loss falls; the weights
# saved here are those of the last epoch, not of the best validation epoch.
torch.save(model.state_dict(), 'lstm_translation_model.pth')


# %% Loss curves
# Use the number of recorded epochs for the x-axis so the plot still works
# when train_model was called with a value other than NUM_EPOCHS.
epoch_axis = range(1, len(train_losses) + 1)
plt.figure(figsize=(10, 5))
plt.plot(epoch_axis, train_losses, label='Training Loss')
plt.plot(epoch_axis, val_losses, label='Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('LSTM-Training and Validation Losses')
plt.legend()
plt.show()


# %% Translation helpers
def translate_new_sentence(model, sentence, src_vocab, tgt_vocab, max_len=MAX_LEN):
    """Translate ``sentence`` with the per-position model and return the text.

    The model's ``forward`` ignores its second argument, so feeding it a
    growing target prefix step by step recomputed the same output on every
    iteration. This version matches how the model is trained: the source is
    encoded with ``sentence_to_tensor``, the model is run once, and the argmax
    at each position is read in order. Position 0 lines up with the start
    marker of the target and is skipped; reading stops at the end marker.

    This single definition replaces the two conflicting definitions the
    notebook had, of which the later one silently shadowed the earlier.

    Args:
        model: Trained model, called as ``model(source_ids)``.
        sentence: Raw source sentence.
        src_vocab: Source token -> id mapping (with special-token keys).
        tgt_vocab: Target token -> id mapping (with special-token keys).
        max_len: Length the source is padded or truncated to.

    Returns:
        The predicted target tokens joined by single spaces, in predicted
        order and with repetitions kept. Empty string if nothing is predicted
        before the end marker.
    """
    model.eval()
    src_tensor = sentence_to_tensor(sentence, src_vocab, max_len).unsqueeze(0).to(DEVICE)
    with torch.no_grad():
        predicted_ids = model(src_tensor).argmax(dim=-1)[0].tolist()

    # Build the inverse mapping once instead of scanning the vocabulary for
    # every predicted token.
    id_to_word = {idx: word for word, idx in tgt_vocab.items()}
    end_id = tgt_vocab['']

    words = []
    for token_id in predicted_ids[1:]:
        if token_id == end_id:
            break
        word = id_to_word.get(token_id)
        if word is not None:  # ids without a vocabulary entry are skipped
            words.append(word)
    return ' '.join(words)


def translate_and_format(model, input_sentence, input_vocab, output_vocab, direction, max_len=MAX_LEN):
    """Translate ``input_sentence`` and wrap source and result in direction tags.

    Args:
        model: Trained model passed through to ``translate_new_sentence``.
        input_sentence: Raw source sentence.
        input_vocab: Source token -> id mapping.
        output_vocab: Target token -> id mapping.
        direction: Pair ``(source_tag, target_tag)``, e.g. ``("en", "ja")``.
        max_len: Passed through to ``translate_new_sentence``.

    Returns:
        A multi-line string: the tagged source line, then the tagged
        translation line.
    """
    translated_sentence = translate_new_sentence(model, input_sentence, input_vocab, output_vocab, max_len)
    return f"\n<{direction[0]}> {input_sentence.strip()} \n<{direction[1]}> {translated_sentence} \n"


# %% Example translation
en_example_sentence = "who are you."
ja_example_sentence = "あなたは誰ですか."  # reference translation, not fed to the model

# English to Japanese
translated_ja_sentence = translate_and_format(model, en_example_sentence, en_vocab, ja_vocab, direction=("en", "ja"))
print(translated_ja_sentence)

# The Japanese -> English call was removed: `model` is trained on
# (English, Japanese) batches only and its output layer has len(ja_vocab)
# logits, so feeding it Japanese ids and decoding the result with en_vocab
# yields unrelated English words. A separate model trained in that direction
# is needed.


# %% BLEU / chrF evaluation
def calculate_bleu_chrf(en_sentences, te_sentences, model, en_vocab, te_vocab, max_len=50, bleu_tokenize=None):
    """Score the model's translation of every source sentence.

    Args:
        en_sentences: Source sentences.
        te_sentences: Reference translations, aligned by index.
        model: Trained model.
        en_vocab: Source token -> id mapping.
        te_vocab: Target token -> id mapping.
        max_len: Passed through to ``translate_new_sentence``.
        bleu_tokenize: Optional sacreBLEU tokenizer name forwarded to
            ``corpus_bleu`` as ``tokenize``. ``None`` keeps the library
            default.

    Returns:
        Tuple ``(bleu_scores, chrf_scores)``: two lists with one score per
        sentence pair.
    """
    bleu_kwargs = {} if bleu_tokenize is None else {'tokenize': bleu_tokenize}
    bleu_scores, chrf_scores = [], []

    # Iterate over the function's own arguments; the previous version read the
    # undefined globals `bu_sentences` / `bu_vocab` instead.
    for en_sentence, te_sentence in zip(en_sentences, te_sentences):
        hypothesis = translate_new_sentence(model, en_sentence, en_vocab, te_vocab, max_len)
        references = [[te_sentence]]

        # Single-sentence "corpus" scores; sacreBLEU's default smoothing
        # settings apply (none are overridden here).
        bleu_scores.append(sacrebleu.corpus_bleu([hypothesis], references, **bleu_kwargs).score)
        chrf_scores.append(sacrebleu.corpus_chrf([hypothesis], references).score)

    return bleu_scores, chrf_scores


# NOTE(review): this scores all loaded sentence pairs, including the 80 % the
# model was trained on. For a held-out figure, pass only the validation slice.
# Character-level BLEU tokenization is requested because the references are
# Japanese text without word spacing.
lstm_bleu_scores, lstm_chrf_scores = calculate_bleu_chrf(
    en_sentences, ja_sentences, model, en_vocab, ja_vocab, bleu_tokenize='char'
)

BLEU_CSV_PATH = 'Seq2Seq_BLEU_scores11.csv'
CHRF_CSV_PATH = 'Seq2Seq_CHRF_scores1.csv'


def _write_score_column(path, header, scores):
    """Write ``scores`` to ``path`` as a one-column CSV with ``header`` on top."""
    with open(path, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow([header])
        writer.writerows([score] for score in scores)


_write_score_column(BLEU_CSV_PATH, "BLEU Score", lstm_bleu_scores)
_write_score_column(CHRF_CSV_PATH, "CHRF Score", lstm_chrf_scores)

print("BLEU and CHRF scores saved to CSV files.")


# %% Reload the saved scores
import pandas as pd

bleu_scores_path = BLEU_CSV_PATH
bleu_df = pd.read_csv(bleu_scores_path)

chrf_scores_path = CHRF_CSV_PATH
chrf_df = pd.read_csv(chrf_scores_path)

# Display the first few rows of both files to understand their structure
bleu_df.head(), chrf_df.head()


# %% Plot BLEU and chrF per sample
plt.figure(figsize=(10, 6))
plt.plot(bleu_df['BLEU Score'], label='BLEU Score', marker='o')
plt.plot(chrf_df['CHRF Score'], label='CHRF Score', marker='x')

plt.xlabel('Sample Index')
plt.ylabel('Score')
plt.title('Comparison of BLEU and CHRF Scores')
plt.legend()
plt.grid(True)

plt.show()
" + ], + "image/png": "\n" + }, + "metadata": {} + } + ] + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "XafG0-qKOrA4" + }, + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file