{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "6a6de8e2", "metadata": { "id": "6a6de8e2" }, "outputs": [], "source": [ "import warnings\n", "warnings.filterwarnings('ignore')\n", "import string\n", "import re\n", "from unicodedata import normalize\n", "import numpy as np\n", "from keras.preprocessing.text import Tokenizer\n", "from keras.preprocessing.sequence import pad_sequences\n", "from keras.utils import to_categorical\n", "from keras.models import Sequential,load_model\n", "from keras.layers import LSTM,Dense,Embedding,RepeatVector,TimeDistributed\n", "from keras.callbacks import EarlyStopping\n", "from keras.preprocessing.text import Tokenizer\n", "from keras.preprocessing.sequence import pad_sequences\n", "from nltk.translate.bleu_score import corpus_bleu\n", "import pandas as pd\n", "from string import punctuation\n", "import matplotlib.pyplot as plt\n", "from IPython.display import Markdown, display\n", "\n", "def printmd(string):\n", " # Print with Markdowns\n", " display(Markdown(string))" ] }, { "cell_type": "code", "execution_count": 2, "id": "cNkcJJtCi_I4", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "cNkcJJtCi_I4", "outputId": "76757ad6-0fed-4b84-9bde-1d7991ff10ee" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Mounted at /content/drive\n" ] } ], "source": [ "from google.colab import drive\n", "drive.mount('/content/drive')" ] }, { "cell_type": "code", "execution_count": 3, "id": "d7439528", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 151 }, "id": "d7439528", "outputId": "da232d2d-0551-4d62-bc4f-119456037d6a" }, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "" ], "text/markdown": "## 10000 \"parallel sentences\" will be loaded (original sentence + its translation)" }, "metadata": {} }, { "output_type": "display_data", "data": { "text/plain": [ "" ], "text/markdown": "## 9000 \"parallel sentences\" will be used to train the model" }, "metadata": {} }, { "output_type": "display_data", "data": { "text/plain": [ "" ], "text/markdown": "## 1000 \"parallel sentences\" will be used to test the model" }, "metadata": {} } ], "source": [ "total_sentences = 10000\n", "\n", "# Load the dataset\n", "dataset = pd.read_csv(\"/content/drive/MyDrive/Colab Notebooks/Dataset/eng_-french.csv\", nrows = total_sentences)\n", "\n", "# What proportion of the sentences will be used for the test set\n", "test_proportion = 0.1\n", "train_test_threshold = int( (1-test_proportion) * total_sentences)\n", "\n", "printmd(f'## {total_sentences} \"parallel sentences\" will be loaded (original sentence + its translation)')\n", "printmd(f'## {train_test_threshold} \"parallel sentences\" will be used to train the model')\n", "printmd(f'## {total_sentences-train_test_threshold} \"parallel sentences\" will be used to test the model')" ] }, { "cell_type": "code", "execution_count": 4, "id": "5cf29feb", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 363 }, "id": "5cf29feb", "outputId": "72534a51-013d-4569-8043-d1fbb474675c" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " English words/sentences French words/sentences\n", "1554 Let me die. Laisse-moi mourir.\n", "2087 He's a slob. C'est un flemmard.\n", "5470 I have to try. Il faut que j'essaie.\n", "2363 I was naive. Je fus crédule.\n", "7570 He is bankrupt. Il est en faillite.\n", "6427 That's a fact. C'est un fait.\n", "1651 Talk to me! Parlez-moi !\n", "4164 Keep talking. Continuez de parler.\n", "1231 I broke it. Je l'ai cassée.\n", "9232 Tom is a judge. Tom est juge." ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
English words/sentencesFrench words/sentences
1554Let me die.Laisse-moi mourir.
2087He's a slob.C'est un flemmard.
5470I have to try.Il faut que j'essaie.
2363I was naive.Je fus crédule.
7570He is bankrupt.Il est en faillite.
6427That's a fact.C'est un fait.
1651Talk to me!Parlez-moi !
4164Keep talking.Continuez de parler.
1231I broke it.Je l'ai cassée.
9232Tom is a judge.Tom est juge.
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "summary": "{\n \"name\": \"dataset\",\n \"rows\": 10,\n \"fields\": [\n {\n \"column\": \"English words/sentences\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 10,\n \"samples\": [\n \"I broke it.\",\n \"He's a slob.\",\n \"That's a fact.\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"French words/sentences\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 10,\n \"samples\": [\n \"Je l'ai cass\\u00e9e.\",\n \"C'est un flemmard.\",\n \"C'est un fait.\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 4 } ], "source": [ "# Shuffle the dataset\n", "dataset = dataset.sample(frac=1, random_state=0)\n", "dataset.iloc[1000:1010]" ] }, { "cell_type": "code", "execution_count": 5, "id": "33f574a5", "metadata": { "id": "33f574a5" }, "outputs": [], "source": [ "def clean(string):\n", " # Clean the string\n", " string = string.replace(\"\\u202f\",\" \") # Replace no-break space with space\n", " string = string.lower()\n", "\n", " # Delete the punctuation and the numbers\n", " for p in punctuation + \"«»\" + \"0123456789\":\n", " string = string.replace(p,\" \")\n", "\n", " string = re.sub('\\s+',' ', string)\n", " string = string.strip()\n", "\n", " return string\n", "\n", "# Clean the sentences\n", "dataset[\"English words/sentences\"] = dataset[\"English words/sentences\"].apply(lambda x: clean(x))\n", "dataset[\"French words/sentences\"] = dataset[\"French words/sentences\"].apply(lambda x: clean(x))\n", "\n", "# Select one part of the dataset\n", "dataset = dataset.values\n", "dataset = dataset[:total_sentences]\n", "\n", "# split into train/test\n", "train, test = dataset[:train_test_threshold], dataset[train_test_threshold:]\n", "\n", "# Define the name of the source and of the target\n", "# This will be used in the outputs of this notebook\n", "source_str, target_str = \"French\", \"English\"\n", "\n", "# The index in the numpy array of the source and of the target\n", "idx_src, idx_tar = 1, 0\n" ] }, { "cell_type": "code", "execution_count": 6, "id": "ZdkiZ76oSt34", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 363 }, "id": "ZdkiZ76oSt34", "outputId": "a3e74a90-561e-48b7-9959-50ee1d697bc0" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " 0 1\n", "0 let me die laisse moi mourir\n", "1 he s a slob c est un flemmard\n", "2 i have to try il faut que j essaie\n", "3 i was naive je fus crédule\n", "4 he is bankrupt il est en faillite\n", "5 that s a fact c est un fait\n", "6 talk to me parlez moi\n", "7 keep talking continuez de parler\n", "8 i broke it je l ai cassée\n", "9 tom is a judge tom est juge" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
01
0let me dielaisse moi mourir
1he s a slobc est un flemmard
2i have to tryil faut que j essaie
3i was naiveje fus crédule
4he is bankruptil est en faillite
5that s a factc est un fait
6talk to meparlez moi
7keep talkingcontinuez de parler
8i broke itje l ai cassée
9tom is a judgetom est juge
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "summary": "{\n \"name\": \"pd\",\n \"rows\": 10,\n \"fields\": [\n {\n \"column\": 0,\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 10,\n \"samples\": [\n \"i broke it\",\n \"he s a slob\",\n \"that s a fact\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": 1,\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 10,\n \"samples\": [\n \"je l ai cass\\u00e9e\",\n \"c est un flemmard\",\n \"c est un fait\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 6 } ], "source": [ "# Display the result after cleaning\n", "pd.DataFrame(dataset[1000:1010])" ] }, { "cell_type": "code", "execution_count": 7, "id": "275b13e8", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 116 }, "id": "275b13e8", "outputId": "3e708cc0-7e3d-426d-cf56-304df00e544b" }, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "" ], "text/markdown": "\nTarget (English) Vocabulary Size: 2099" }, "metadata": {} }, { "output_type": "display_data", "data": { "text/plain": [ "" ], "text/markdown": "Target (English) Max Length: 5" }, "metadata": {} }, { "output_type": "display_data", "data": { "text/plain": [ "" ], "text/markdown": "\nSource (French) Vocabulary Size: 4039" }, "metadata": {} }, { "output_type": "display_data", "data": { "text/plain": [ "" ], "text/markdown": "Source (French) Max Length: 12\n" }, "metadata": {} } ], "source": [ "def create_tokenizer(lines):\n", " # fit a tokenizer\n", " tokenizer = Tokenizer()\n", " tokenizer.fit_on_texts(lines)\n", " return tokenizer\n", "\n", "def max_len(lines):\n", " # max sentence length\n", " return max(len(line.split()) for line in lines)\n", "\n", "def encode_sequences(tokenizer, length, lines):\n", " # encode and pad sequences\n", " X = tokenizer.texts_to_sequences(lines) # integer encode sequences\n", " X = pad_sequences(X, maxlen=length, padding='post') # pad sequences with 0 values\n", " return X\n", "\n", "def encode_output(sequences, vocab_size):\n", " # one hot encode target sequence\n", " ylist = list()\n", " for sequence in sequences:\n", " encoded = to_categorical(sequence, num_classes=vocab_size)\n", " ylist.append(encoded)\n", " y = np.array(ylist)\n", " y = y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)\n", " return y\n", "\n", "# Prepare target tokenizer\n", "tar_tokenizer = create_tokenizer(dataset[:, idx_tar])\n", "tar_vocab_size = len(tar_tokenizer.word_index) + 1\n", "tar_length = max_len(dataset[:, idx_tar])\n", "printmd(f'\\nTarget ({target_str}) Vocabulary Size: {tar_vocab_size}')\n", "printmd(f'Target ({target_str}) Max Length: {tar_length}')\n", "\n", "# Prepare source tokenizer\n", "src_tokenizer = create_tokenizer(dataset[:, idx_src])\n", "src_vocab_size = len(src_tokenizer.word_index) + 1\n", "src_length = max_len(dataset[:, idx_src])\n", "printmd(f'\\nSource ({source_str}) Vocabulary Size: {src_vocab_size}')\n", "printmd(f'Source ({source_str}) Max Length: {src_length}\\n')\n", "\n", "# Prepare training data\n", "trainX = encode_sequences(src_tokenizer, src_length, train[:, idx_src])\n", "trainY = encode_sequences(tar_tokenizer, tar_length, train[:, idx_tar])\n", "trainY = encode_output(trainY, tar_vocab_size)\n", "\n", "# Prepare test data\n", "testX = encode_sequences(src_tokenizer, src_length, test[:, idx_src])\n", "testY = encode_sequences(tar_tokenizer, tar_length, test[:, idx_tar])\n", "testY = encode_output(testY, tar_vocab_size)" ] }, { "cell_type": "code", "execution_count": 8, "id": "06fb69d9", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "06fb69d9", "outputId": "ec5dffb6-3bb9-43f0-847d-f9719f1999d5" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Epoch 1/20\n", "127/127 [==============================] - 32s 184ms/step - loss: 4.3580 - val_loss: 3.5034\n", "Epoch 2/20\n", "127/127 [==============================] - 20s 159ms/step - loss: 3.3180 - val_loss: 3.2966\n", "Epoch 3/20\n", "127/127 [==============================] - 29s 232ms/step - loss: 3.1166 - val_loss: 3.1305\n", "Epoch 4/20\n", "127/127 [==============================] - 22s 174ms/step - loss: 2.9387 - val_loss: 3.0357\n", "Epoch 5/20\n", "127/127 [==============================] - 20s 160ms/step - loss: 2.8170 - val_loss: 2.9482\n", "Epoch 6/20\n", "127/127 [==============================] - 22s 171ms/step - loss: 2.7020 - val_loss: 2.8696\n", "Epoch 7/20\n", "127/127 [==============================] - 21s 162ms/step - loss: 2.5850 - val_loss: 2.7787\n", "Epoch 8/20\n", "127/127 [==============================] - 22s 171ms/step - loss: 2.4499 - val_loss: 2.7062\n", "Epoch 9/20\n", "127/127 [==============================] - 22s 177ms/step - loss: 2.3151 - val_loss: 2.5752\n", "Epoch 10/20\n", "127/127 [==============================] - 21s 163ms/step - loss: 2.1780 - val_loss: 2.4899\n", "Epoch 11/20\n", "127/127 [==============================] - 22s 171ms/step - loss: 2.0454 - val_loss: 2.3923\n", "Epoch 12/20\n", "127/127 [==============================] - 20s 160ms/step - loss: 1.9261 - val_loss: 2.3220\n", "Epoch 13/20\n", "127/127 [==============================] - 22s 175ms/step - loss: 1.8146 - val_loss: 2.2600\n", "Epoch 14/20\n", "127/127 [==============================] - 20s 160ms/step - loss: 1.7014 - val_loss: 2.1994\n", "Epoch 15/20\n", "127/127 [==============================] - 22s 171ms/step - loss: 1.5957 - val_loss: 2.1526\n", "Epoch 16/20\n", "127/127 [==============================] - 20s 160ms/step - loss: 1.4959 - val_loss: 2.1011\n", "Epoch 17/20\n", "127/127 [==============================] - 22s 173ms/step - loss: 1.4070 - val_loss: 2.0468\n", "Epoch 18/20\n", "127/127 [==============================] - 20s 160ms/step - loss: 1.3184 - val_loss: 2.0204\n", "Epoch 19/20\n", "127/127 [==============================] - 22s 170ms/step - loss: 1.2317 - val_loss: 1.9782\n", "Epoch 20/20\n", "127/127 [==============================] - 20s 161ms/step - loss: 1.1525 - val_loss: 1.9759\n" ] } ], "source": [ "def create_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):\n", " # Create the model\n", " model = Sequential()\n", " model.add(Embedding(src_vocab_size, n_units, input_length=src_length, mask_zero=True))\n", " model.add(LSTM(n_units))\n", " model.add(RepeatVector(tar_timesteps))\n", " model.add(LSTM(n_units, return_sequences=True))\n", " model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))\n", " return model\n", "\n", "# Create model\n", "model = create_model(src_vocab_size, tar_vocab_size, src_length, tar_length, 256)\n", "model.compile(optimizer='adam', loss='categorical_crossentropy')\n", "\n", "history = model.fit(trainX,\n", " trainY,\n", " epochs=20,\n", " batch_size=64,\n", " validation_split=0.1,\n", " verbose=1,\n", " callbacks=[\n", " EarlyStopping(\n", " monitor='val_loss',\n", " patience=10,\n", " restore_best_weights=True\n", " )\n", " ])" ] }, { "cell_type": "code", "execution_count": 9, "id": "6b90c23c", "metadata": { "id": "6b90c23c", "colab": { "base_uri": "https://localhost:8080/", "height": 452 }, "outputId": "07f0bc72-13d7-4709-c8e4-ed4cd1bbfb85" }, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/png": "\n" }, "metadata": {} } ], "source": [ "pd.DataFrame(history.history).plot()\n", "plt.title(\"Loss\")\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 10, "id": "138d6368", "metadata": { "id": "138d6368", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "9b93e877-e123-40d1-cf72-c19e9ba617e0" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "### Result on the Training Set ###\n", "FRENCH (SOURCE) ENGLISH (TARGET) AUTOMATIC TRANSLATION IN ENGLISH\n", "\n", "nous en savons assez we know enough we re\n", "garde ton sang froid stay calm keep calm\n", "je ne pleurerai pas i won t cry i didn t go\n", "je ne suis pas contente i m not happy i m not busy\n", "moi je veux ça i want that i ll try\n", "j étais tellement heureuse i was so happy i m too busy\n", "j aime le printemps i like spring i like cookies\n", "c est mon garçon that s my boy it s my dog\n", "mille mercis many thanks thanks a\n", "quelle horreur how horrible how nothing\n", "soyez satisfaites be content be content\n", "toi décide you decide you promised\n", "je m en suis remis i recovered i recovered\n", "ce sont les affaires it s business it is good\n", "je dois m en aller i need to go i must to go\n", "sommes nous prêtes are we ready are we kidding\n", "arrêtez de crier stop shouting stop grumbling\n", "je lis souvent i often read i m ashamed\n", "les plantes croissent plants grow plants stinks\n", "il m a fallu le faire i had to do it i had to it\n", "nous éclatâmes de rire we broke up we lost\n", "\n", "\n", "### Result on the Test Set ###\n", "FRENCH (SOURCE) ENGLISH (TARGET) AUTOMATIC TRANSLATION IN ENGLISH\n", "\n", "ils ont abandonné they gave up they lost\n", "rappelle moi call me back help me\n", "je veux essayer i want to try i want to you\n", "ça fonctionne bien it works well it was hard\n", "grimpe dans la camionnette get in the van get on the bus\n", "je suis mince i m thin i m innocent\n", "elle semble riche she seems rich she sued well\n", "ça me gave this annoys me i m wrong\n", "c était long it was long how thrilling\n", "c était un mensonge it was a lie it was a lie\n", "conduis toi en homme act like a man get to sleep\n", "laissez moi m en occuper leave it to me let me alone\n", "puis je manger ceci may i eat this can i go it\n", "devine make a guess let s go\n", "je ne suis pas jolie i m not pretty i m not fat\n", "demande à quiconque ask anyone stop clichés\n", "venez nous rejoindre come join us are us\n", "vous ennuyez vous are you bored you you you\n", "je ne viendrai pas i won t come i can t go\n", "c est un voleur he is a thief it s a joke\n", "bien joué well done good job\n" ] } ], "source": [ "def word_for_id(integer, tokenizer):\n", " # map an integer to a word\n", " for word, index in tokenizer.word_index.items():\n", " if index == integer:\n", " return word\n", " return None\n", "\n", "def predict_seq(model, tokenizer, source):\n", " # generate target from a source sequence\n", " prediction = model.predict(source, verbose=0)[0]\n", " integers = [np.argmax(vector) for vector in prediction]\n", " target = list()\n", " for i in integers:\n", " word = word_for_id(i, tokenizer)\n", " if word is None:\n", " break\n", " target.append(word)\n", " return ' '.join(target)\n", "\n", "def compare_prediction(model, tokenizer, sources, raw_dataset, limit=20):\n", " # evaluate a model\n", " actual, predicted = [], []\n", " src = f'{source_str.upper()} (SOURCE)'\n", " tgt = f'{target_str.upper()} (TARGET)'\n", " pred = f'AUTOMATIC TRANSLATION IN {target_str.upper()}'\n", " print(f'{src:30} {tgt:25} {pred}\\n')\n", "\n", " for i, source in enumerate(sources): # translate encoded source text\n", " source = source.reshape((1, source.shape[0]))\n", " translation = predict_seq(model, tokenizer, source)\n", " raw_target, raw_src = raw_dataset[i]\n", " print(f'{raw_src:30} {raw_target:25} {translation}')\n", " if i >= limit: # Display some of the result\n", " break\n", "\n", "# test on some training sequences\n", "print('### Result on the Training Set ###')\n", "compare_prediction(model, tar_tokenizer, trainX, train)\n", "\n", "# test on some test sequences\n", "print('\\n\\n### Result on the Test Set ###')\n", "compare_prediction(model, tar_tokenizer, testX, test)" ] }, { "cell_type": "code", "execution_count": 11, "id": "2e935484", "metadata": { "id": "2e935484" }, "outputs": [], "source": [ "# It takes long to compute the BLEU Score\n", "\n", "def bleu_score(model, tokenizer, sources, raw_dataset):\n", " # Get the bleu score of a model\n", " actual, predicted = [], []\n", " for i, source in enumerate(sources):\n", " # translate encoded source text\n", " source = source.reshape((1, source.shape[0]))\n", " translation = predict_seq(model, tar_tokenizer, source)\n", " raw_target, raw_src = raw_dataset[i]\n", " actual.append([raw_target.split()])\n", " predicted.append(translation.split())\n", "\n", " bleu_dic = {}\n", " bleu_dic['1-grams'] = corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0))\n", " bleu_dic['1-2-grams'] = corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0))\n", " bleu_dic['1-3-grams'] = corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0))\n", " bleu_dic['1-4-grams'] = corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25))\n", "\n", " return bleu_dic\n", "\n", "# Compute the BLEU Score\n", "bleu_train = bleu_score(model, tar_tokenizer, trainX, train)\n", "bleu_test = bleu_score(model, tar_tokenizer, testX, test)" ] }, { "cell_type": "code", "execution_count": 12, "id": "d955dd33", "metadata": { "id": "d955dd33", "colab": { "base_uri": "https://localhost:8080/", "height": 452 }, "outputId": "abcb44b3-1ea2-407b-c09b-a4e31a918949" }, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/png": "\n" }, "metadata": {} } ], "source": [ "plt.bar(x = bleu_train.keys(), height = bleu_train.values())\n", "plt.title(\"BLEU Score with the training set\")\n", "plt.ylim((0,1))\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 13, "id": "f3cf03db", "metadata": { "id": "f3cf03db", "colab": { "base_uri": "https://localhost:8080/", "height": 452 }, "outputId": "bc1f4f46-5e1b-4beb-8c6a-c0be5257b6fb" }, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/png": "\n" }, "metadata": {} } ], "source": [ "plt.bar(x = bleu_test.keys(), height = bleu_test.values())\n", "plt.title(\"BLEU Score with the test set\")\n", "plt.ylim((0,1))\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 18, "id": "LYE8QofL1XC1", "metadata": { "id": "LYE8QofL1XC1" }, "outputs": [], "source": [ "model.save('/content/drive/MyDrive/Colab Notebooks/Models/french_to_english_translator.h5')" ] }, { "cell_type": "code", "execution_count": 19, "id": "PoS_noGF1eXf", "metadata": { "id": "PoS_noGF1eXf", "colab": { "base_uri": "https://localhost:8080/", "height": 626 }, "outputId": "ad5e59c7-b064-4eb0-a85b-5e76d7be1bfb" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).\n", "\n", "Colab notebook detected. To show errors in colab notebook, set debug=True in launch()\n", "Running on public URL: https://2ebce967724a96d7c0.gradio.live\n", "\n", "This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)\n" ] }, { "output_type": "display_data", "data": { "text/plain": [ "" ], "text/html": [ "
" ] }, "metadata": {} }, { "output_type": "execute_result", "data": { "text/plain": [] }, "metadata": {}, "execution_count": 19 } ], "source": [ "import gradio as gr\n", "\n", "# Load the trained model\n", "model = load_model('/content/drive/MyDrive/Colab Notebooks/Models/french_to_english_translator.h5')\n", "\n", "# Function to translate French to English\n", "def translate_french_to_english(french_sentence):\n", " # Clean the input sentence\n", " french_sentence = clean(french_sentence)\n", " # Tokenize and pad the input sentence\n", " input_sequence = encode_sequences(src_tokenizer, src_length, [french_sentence])\n", " # Generate the translation\n", " english_translation = predict_seq(model, tar_tokenizer, input_sequence)\n", " return english_translation\n", "\n", "# Create a Gradio interface\n", "gr.Interface(\n", " fn=translate_french_to_english,\n", " inputs=\"text\",\n", " outputs=\"text\",\n", " title=\"French to English Translator\",\n", " description=\"Translate French sentences to English.\"\n", ").launch()" ] } ], "metadata": { "colab": { "provenance": [] }, "kaggle": { "accelerator": "nvidiaTeslaT4", "dataSources": [ { "datasetId": 592212, "sourceId": 1067156, "sourceType": "datasetVersion" } ], "dockerImageVersionId": 30260, "isGpuEnabled": true, "isInternetEnabled": true, "language": "python", "sourceType": "notebook" }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.2" }, "papermill": { "default_parameters": {}, "duration": 3017.154782, "end_time": "2024-01-11T16:19:30.323673", "environment_variables": {}, "exception": null, "input_path": "__notebook__.ipynb", "output_path": "__notebook__.ipynb", "parameters": {}, "start_time": "2024-01-11T15:29:13.168891", "version": "2.3.4" } }, "nbformat": 4, "nbformat_minor": 5 }