Upload french_to_english_translation_using_seq2seq.py

Browse files

Files changed (1) hide show

french_to_english_translation_using_seq2seq.py +273 -0

french_to_english_translation_using_seq2seq.py ADDED Viewed

	@@ -0,0 +1,273 @@

+# -*- coding: utf-8 -*-
+"""french-to-english-translation-using-seq2seq.ipynb
+Automatically generated by Colab.
+Original file is located at
+    https://colab.research.google.com/drive/1I_pfLKfUYqIWiX3przMoSFvczO_H83er
+"""
+import warnings
+warnings.filterwarnings('ignore')
+import string
+import re
+from unicodedata import normalize
+import numpy as np
+from keras.preprocessing.text import Tokenizer
+from keras.preprocessing.sequence import pad_sequences
+from keras.utils import to_categorical
+from keras.models import Sequential,load_model
+from keras.layers import LSTM,Dense,Embedding,RepeatVector,TimeDistributed
+from keras.callbacks import EarlyStopping
+from keras.preprocessing.text import Tokenizer
+from keras.preprocessing.sequence import pad_sequences
+from nltk.translate.bleu_score import corpus_bleu
+import pandas as pd
+from string import punctuation
+import matplotlib.pyplot as plt
+from IPython.display import Markdown, display
def printmd(string):
    """Render ``string`` as Markdown in the notebook output cell."""
    rendered = Markdown(string)
    display(rendered)
# Mount Google Drive so the CSV below can be read from it.
from google.colab import drive
drive.mount('/content/drive')

# Number of sentence pairs read from the CSV.
total_sentences = 10000
# What proportion of the sentences will be used for the test set
test_proportion = 0.1

# Load the dataset
dataset = pd.read_csv(
    "/content/drive/MyDrive/Colab Notebooks/Dataset/eng_-french.csv",
    nrows=total_sentences,
)

# Row index at which the training part ends and the test part begins.
train_test_threshold = int((1 - test_proportion) * total_sentences)

printmd(f'## {total_sentences} "parallel sentences" will be loaded (original sentence + its translation)')
printmd(f'## {train_test_threshold} "parallel sentences" will be used to train the model')
printmd(f'## {total_sentences-train_test_threshold} "parallel sentences" will be used to test the model')

# Shuffle the dataset (fixed seed so the train/test split is reproducible)
dataset = dataset.sample(frac=1, random_state=0)
# Notebook preview of a few shuffled rows; has no effect when run as a script.
dataset.iloc[1000:1010]
# Characters that are replaced by a single space before whitespace is
# collapsed: the narrow no-break space, ASCII punctuation, French quotation
# marks and digits.
_CLEAN_TABLE = str.maketrans(
    {char: " " for char in "\u202f" + punctuation + "«»" + "0123456789"}
)
# Raw string: '\s' in a normal string literal is an invalid escape sequence.
_WHITESPACE_RUN = re.compile(r"\s+")


def clean(string):
    """Normalize a sentence before tokenization.

    The text is lower-cased, every punctuation mark, French quotation mark,
    digit and narrow no-break space is replaced by a space, runs of
    whitespace are collapsed to one space, and leading/trailing whitespace
    is removed.

    Args:
        string: The raw sentence.

    Returns:
        The cleaned sentence; an empty string if nothing but stripped
        characters was present.
    """
    string = string.lower()
    # One translate pass replaces the chain of per-character replace() calls.
    string = string.translate(_CLEAN_TABLE)
    return _WHITESPACE_RUN.sub(" ", string).strip()
# Clean the sentences of both language columns with the same routine
dataset["English words/sentences"] = dataset["English words/sentences"].apply(clean)
dataset["French words/sentences"] = dataset["French words/sentences"].apply(clean)

# Switch to a plain numpy array and keep at most `total_sentences` rows
dataset = dataset.values[:total_sentences]

# split into train/test
train = dataset[:train_test_threshold]
test = dataset[train_test_threshold:]

# Define the name of the source and of the target
# This will be used in the outputs of this notebook
source_str = "French"
target_str = "English"

# The index in the numpy array of the source and of the target
idx_tar = 0
idx_src = 1

# Display the result after cleaning
pd.DataFrame(dataset[1000:1010])
def create_tokenizer(lines):
    """Return a new ``Tokenizer`` fitted on the sentences in ``lines``."""
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted
def max_len(lines):
    """Return the word count of the longest sentence in ``lines``.

    Words are counted by splitting each sentence on whitespace.

    Args:
        lines: Iterable of sentences (strings).

    Returns:
        The largest number of whitespace-separated words in any sentence,
        or 0 when ``lines`` is empty (instead of raising ``ValueError``).
    """
    return max((len(line.split()) for line in lines), default=0)
def encode_sequences(tokenizer, length, lines):
    """Integer-encode ``lines`` with ``tokenizer`` and pad them to ``length``.

    Padding uses 0 values appended after the sequence (``padding='post'``).
    """
    integer_encoded = tokenizer.texts_to_sequences(lines)
    return pad_sequences(integer_encoded, maxlen=length, padding='post')
def encode_output(sequences, vocab_size):
    """One-hot encode the integer target sequences.

    ``sequences`` is indexed as a 2-D array (``shape[0]`` rows and
    ``shape[1]`` positions); the result is reshaped to
    ``(shape[0], shape[1], vocab_size)``.
    """
    one_hot_rows = [
        to_categorical(row, num_classes=vocab_size) for row in sequences
    ]
    n_rows, n_steps = sequences.shape[0], sequences.shape[1]
    return np.array(one_hot_rows).reshape(n_rows, n_steps, vocab_size)
# Prepare target tokenizer (fitted on the whole dataset, train and test rows)
tar_tokenizer = create_tokenizer(dataset[:, idx_tar])
tar_vocab_size = len(tar_tokenizer.word_index) + 1  # +1: index 0 is used for padding
tar_length = max_len(dataset[:, idx_tar])
printmd(f'\nTarget ({target_str}) Vocabulary Size: {tar_vocab_size}')
printmd(f'Target ({target_str}) Max Length: {tar_length}')

# Prepare source tokenizer (same procedure on the source column)
src_tokenizer = create_tokenizer(dataset[:, idx_src])
src_vocab_size = len(src_tokenizer.word_index) + 1
src_length = max_len(dataset[:, idx_src])
printmd(f'\nSource ({source_str}) Vocabulary Size: {src_vocab_size}')
printmd(f'Source ({source_str}) Max Length: {src_length}\n')

# Prepare training data: padded source ids in, one-hot target sequences out
trainX = encode_sequences(src_tokenizer, src_length, train[:, idx_src])
trainY = encode_output(
    encode_sequences(tar_tokenizer, tar_length, train[:, idx_tar]),
    tar_vocab_size,
)

# Prepare test data the same way
testX = encode_sequences(src_tokenizer, src_length, test[:, idx_src])
testY = encode_output(
    encode_sequences(tar_tokenizer, tar_length, test[:, idx_tar]),
    tar_vocab_size,
)
def create_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    """Build the (uncompiled) encoder-decoder translation model.

    Layers, in order: an embedding of the source tokens, an LSTM whose final
    output is repeated ``tar_timesteps`` times, a second LSTM returning the
    full sequence, and a per-timestep softmax over the target vocabulary.

    Args:
        src_vocab: Size of the source vocabulary (embedding input dimension).
        tar_vocab: Size of the target vocabulary (softmax output dimension).
        src_timesteps: Length of the padded source sequences.
        tar_timesteps: Length of the padded target sequences.
        n_units: Embedding dimension and number of units of both LSTMs.

    Returns:
        The ``Sequential`` model.
    """
    model = Sequential()
    # Use the function's own arguments here; the previous version read the
    # module-level src_vocab_size / src_length and ignored its parameters.
    model.add(Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True))
    model.add(LSTM(n_units))
    model.add(RepeatVector(tar_timesteps))
    model.add(LSTM(n_units, return_sequences=True))
    model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))
    return model
# Create model: 256 units for the embedding and for both LSTM layers
model = create_model(src_vocab_size, tar_vocab_size, src_length, tar_length, 256)
model.compile(optimizer='adam', loss='categorical_crossentropy')

# Train; 10% of the training data is held out for validation, and the best
# weights (by validation loss) are restored if early stopping triggers.
history = model.fit(
    trainX,
    trainY,
    epochs=20,
    batch_size=64,
    validation_split=0.1,
    verbose=1,
    callbacks=[
        EarlyStopping(
            monitor='val_loss',
            patience=10,
            restore_best_weights=True,
        ),
    ],
)

# Plot the recorded training history
pd.DataFrame(history.history).plot()
plt.title("Loss")
plt.show()
def word_for_id(integer, tokenizer):
    """Map a token index back to its word.

    Args:
        integer: The token index to look up.
        tokenizer: Object exposing a ``word_index`` mapping (word -> index)
            and, optionally, an ``index_word`` mapping (index -> word).

    Returns:
        The matching word, or ``None`` when no word has that index (for
        example index 0, which is used for padding).
    """
    # Prefer the reverse mapping when the tokenizer provides one: this is
    # called once per predicted token, so a scan of the whole vocabulary
    # each time is needlessly slow.
    index_word = getattr(tokenizer, "index_word", None)
    if index_word:
        return index_word.get(integer)
    # Fallback for tokenizer-like objects that only expose word_index.
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None
def predict_seq(model, tokenizer, source):
    """Translate one encoded source sequence into a target sentence.

    The first prediction of ``model.predict(source)`` is decoded step by
    step: each step's highest-scoring index is mapped to a word, and
    decoding stops at the first index that has no word.
    """
    step_scores = model.predict(source, verbose=0)[0]
    words = []
    for token_id in (np.argmax(scores) for scores in step_scores):
        word = word_for_id(token_id, tokenizer)
        if word is None:
            # No word for this index: treat it as the end of the sentence.
            break
        words.append(word)
    return ' '.join(words)
def compare_prediction(model, tokenizer, sources, raw_dataset, limit=20):
    """Print source, reference and predicted translation side by side.

    Args:
        model: Trained model passed on to ``predict_seq``.
        tokenizer: Target-language tokenizer used to decode predictions.
        sources: Encoded source sequences, one row per sentence.
        raw_dataset: Rows of ``(target sentence, source sentence)`` aligned
            with ``sources``.
        limit: Maximum number of rows to print.
    """
    src = f'{source_str.upper()} (SOURCE)'
    tgt = f'{target_str.upper()} (TARGET)'
    pred = f'AUTOMATIC TRANSLATION IN {target_str.upper()}'
    print(f'{src:30} {tgt:25} {pred}\n')
    for i, source in enumerate(sources):
        # Check the limit before translating so exactly `limit` rows are
        # shown (the check used to run after printing, giving limit + 1).
        if i >= limit:
            break
        # The model expects a batch dimension: (1, sequence length)
        source = source.reshape((1, source.shape[0]))
        translation = predict_seq(model, tokenizer, source)
        raw_target, raw_src = raw_dataset[i]
        print(f'{raw_src:30} {raw_target:25} {translation}')
# Show sample translations for sentences the model was trained on
print('### Result on the Training Set ###')
compare_prediction(model, tar_tokenizer, trainX, train)

# Show sample translations for held-out sentences
print('\n\n### Result on the Test Set ###')
compare_prediction(model, tar_tokenizer, testX, test)
# It takes long to compute the BLEU Score
def bleu_score(model, tokenizer, sources, raw_dataset):
    """Compute cumulative corpus BLEU scores for the model's translations.

    Args:
        model: Trained model passed on to ``predict_seq``.
        tokenizer: Target-language tokenizer used to decode predictions.
        sources: Encoded source sequences, one row per sentence.
        raw_dataset: Rows of ``(target sentence, source sentence)`` aligned
            with ``sources``.

    Returns:
        Dict with the keys ``'1-grams'``, ``'1-2-grams'``, ``'1-3-grams'``
        and ``'1-4-grams'``, each mapped to the corresponding score.
    """
    actual, predicted = [], []
    for i, source in enumerate(sources):
        # translate encoded source text (the model expects a batch dimension)
        source = source.reshape((1, source.shape[0]))
        # Decode with the tokenizer that was passed in, not the global one.
        translation = predict_seq(model, tokenizer, source)
        raw_target, _ = raw_dataset[i]
        # corpus_bleu takes a list of reference translations per sentence
        actual.append([raw_target.split()])
        predicted.append(translation.split())
    # Each weight tuple sums to 1; the 3-gram case previously used
    # (0.3, 0.3, 0.3, 0), which sums to 0.9 and inflates the score.
    weight_sets = {
        '1-grams': (1.0, 0, 0, 0),
        '1-2-grams': (0.5, 0.5, 0, 0),
        '1-3-grams': (1 / 3, 1 / 3, 1 / 3, 0),
        '1-4-grams': (0.25, 0.25, 0.25, 0.25),
    }
    return {
        name: corpus_bleu(actual, predicted, weights=weights)
        for name, weights in weight_sets.items()
    }
# Compute the BLEU Score on both splits
bleu_train = bleu_score(model, tar_tokenizer, trainX, train)
bleu_test = bleu_score(model, tar_tokenizer, testX, test)

# Bar chart of the training-set scores
plt.bar(x=bleu_train.keys(), height=bleu_train.values())
plt.title("BLEU Score with the training set")
plt.ylim((0, 1))
plt.show()

# Bar chart of the test-set scores
plt.bar(x=bleu_test.keys(), height=bleu_test.values())
plt.title("BLEU Score with the test set")
plt.ylim((0, 1))
plt.show()

# Persist the trained model to Google Drive
model.save('/content/drive/MyDrive/Colab Notebooks/Models/french_to_english_translator.h5')

import gradio as gr

# Load the trained model back from the file saved above
model = load_model('/content/drive/MyDrive/Colab Notebooks/Models/french_to_english_translator.h5')
# Function to translate French to English
def translate_french_to_english(french_sentence):
    """Return the model's English translation of ``french_sentence``.

    The sentence is cleaned, encoded and padded with the source tokenizer,
    then decoded with the target tokenizer.
    """
    cleaned = clean(french_sentence)
    # A one-element list gives a batch containing this single sentence
    encoded = encode_sequences(src_tokenizer, src_length, [cleaned])
    return predict_seq(model, tar_tokenizer, encoded)
# Create a Gradio interface: one text box in, one text box out
translator_ui = gr.Interface(
    fn=translate_french_to_english,
    inputs="text",
    outputs="text",
    title="French to English Translator",
    description="Translate French sentences to English.",
)
# Start serving the interface
translator_ui.launch()