# -*- coding: utf-8 -*-
"""french-to-english-translation-using-seq2seq.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1I_pfLKfUYqIWiX3przMoSFvczO_H83er
"""

import warnings
warnings.filterwarnings('ignore')

import re
from string import punctuation
from unicodedata import normalize

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential, load_model
from keras.layers import LSTM, Dense, Embedding, RepeatVector, TimeDistributed
from keras.callbacks import EarlyStopping
from nltk.translate.bleu_score import corpus_bleu

from IPython.display import Markdown, display


def printmd(string):
    # Print a string rendered as Markdown
    display(Markdown(string))


from google.colab import drive
drive.mount('/content/drive')

total_sentences = 10000

# Load the dataset
dataset = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Dataset/eng_-french.csv", nrows=total_sentences)

# Proportion of the sentences that will be used for the test set
test_proportion = 0.1
train_test_threshold = int((1 - test_proportion) * total_sentences)

printmd(f'## {total_sentences} "parallel sentences" will be loaded (original sentence + its translation)')
printmd(f'## {train_test_threshold} "parallel sentences" will be used to train the model')
printmd(f'## {total_sentences - train_test_threshold} "parallel sentences" will be used to test the model')

# Shuffle the dataset
dataset = dataset.sample(frac=1, random_state=0)
dataset.iloc[1000:1010]


def clean(string):
    # Normalize and clean a sentence
    string = string.replace("\u202f", " ")  # Replace narrow no-break space with a regular space
    string = string.lower()
    # Delete the punctuation and the numbers
    for p in punctuation + "«»" + "0123456789":
        string = string.replace(p, " ")
    string = re.sub(r'\s+', ' ', string)
    string = string.strip()
    return string


# Clean the sentences
dataset["English words/sentences"] = dataset["English words/sentences"].apply(lambda x: clean(x))
dataset["French words/sentences"] = dataset["French words/sentences"].apply(lambda x: clean(x))

# Select one part of the dataset
dataset = dataset.values
dataset = dataset[:total_sentences]

# Split into train/test
train, test = dataset[:train_test_threshold], dataset[train_test_threshold:]

# Define the name of the source and of the target
# This will be used in the outputs of this notebook
source_str, target_str = "French", "English"

# The index in the numpy array of the source and of the target
idx_src, idx_tar = 1, 0

# Display the result after cleaning
pd.DataFrame(dataset[1000:1010])


def create_tokenizer(lines):
    # Fit a tokenizer on the given lines
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer


def max_len(lines):
    # Length (in words) of the longest sentence
    return max(len(line.split()) for line in lines)


def encode_sequences(tokenizer, length, lines):
    # Integer-encode the sentences and pad them to a fixed length
    X = tokenizer.texts_to_sequences(lines)               # integer encode sequences
    X = pad_sequences(X, maxlen=length, padding='post')   # pad sequences with 0 values
    return X


def encode_output(sequences, vocab_size):
    # One-hot encode the target sequences
    ylist = list()
    for sequence in sequences:
        encoded = to_categorical(sequence, num_classes=vocab_size)
        ylist.append(encoded)
    y = np.array(ylist)
    y = y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)
    return y
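# Illustrative sanity check (not part of the original pipeline): a toy tokenizer
# shows how encode_sequences integer-encodes sentences and post-pads them with zeros.
# The toy sentences below are arbitrary examples.
_demo_tokenizer = create_tokenizer(["hello world", "hello there friend"])
print(encode_sequences(_demo_tokenizer, 4, ["hello there", "world"]))
# Expected shape: (2, 4); the shorter sequences are padded with 0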
# Prepare the target tokenizer
tar_tokenizer = create_tokenizer(dataset[:, idx_tar])
tar_vocab_size = len(tar_tokenizer.word_index) + 1
tar_length = max_len(dataset[:, idx_tar])
printmd(f'\nTarget ({target_str}) Vocabulary Size: {tar_vocab_size}')
printmd(f'Target ({target_str}) Max Length: {tar_length}')

# Prepare the source tokenizer
src_tokenizer = create_tokenizer(dataset[:, idx_src])
src_vocab_size = len(src_tokenizer.word_index) + 1
src_length = max_len(dataset[:, idx_src])
printmd(f'\nSource ({source_str}) Vocabulary Size: {src_vocab_size}')
printmd(f'Source ({source_str}) Max Length: {src_length}\n')

# Prepare the training data
trainX = encode_sequences(src_tokenizer, src_length, train[:, idx_src])
trainY = encode_sequences(tar_tokenizer, tar_length, train[:, idx_tar])
trainY = encode_output(trainY, tar_vocab_size)

# Prepare the test data
testX = encode_sequences(src_tokenizer, src_length, test[:, idx_src])
testY = encode_sequences(tar_tokenizer, tar_length, test[:, idx_tar])
testY = encode_output(testY, tar_vocab_size)


def create_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    # Encoder-decoder model: LSTM encoder, RepeatVector bridge, LSTM decoder
    model = Sequential()
    model.add(Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True))
    model.add(LSTM(n_units))
    model.add(RepeatVector(tar_timesteps))
    model.add(LSTM(n_units, return_sequences=True))
    model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))
    return model


# Create and train the model
model = create_model(src_vocab_size, tar_vocab_size, src_length, tar_length, 256)
model.compile(optimizer='adam', loss='categorical_crossentropy')

history = model.fit(trainX, trainY,
                    epochs=20,
                    batch_size=64,
                    validation_split=0.1,
                    verbose=1,
                    callbacks=[
                        EarlyStopping(
                            monitor='val_loss',
                            patience=10,
                            restore_best_weights=True
                        )
                    ])

pd.DataFrame(history.history).plot()
plt.title("Loss")
plt.show()


def word_for_id(integer, tokenizer):
    # Map an integer back to the corresponding word
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None


def predict_seq(model, tokenizer, source):
    # Generate the target sentence from an encoded source sequence
    prediction = model.predict(source, verbose=0)[0]
    integers = [np.argmax(vector) for vector in prediction]
    target = list()
    for i in integers:
        word = word_for_id(i, tokenizer)
        if word is None:
            break
        target.append(word)
    return ' '.join(target)


def compare_prediction(model, tokenizer, sources, raw_dataset, limit=20):
    # Print source sentences, their reference translations and the model's translations
    src = f'{source_str.upper()} (SOURCE)'
    tgt = f'{target_str.upper()} (TARGET)'
    pred = f'AUTOMATIC TRANSLATION IN {target_str.upper()}'
    print(f'{src:30} {tgt:25} {pred}\n')
    for i, source in enumerate(sources):
        # Translate the encoded source text
        source = source.reshape((1, source.shape[0]))
        translation = predict_seq(model, tokenizer, source)
        raw_target, raw_src = raw_dataset[i]
        print(f'{raw_src:30} {raw_target:25} {translation}')
        if i >= limit:  # Display only some of the results
            break


# Test on some training sequences
print('### Result on the Training Set ###')
compare_prediction(model, tar_tokenizer, trainX, train)

# Test on some test sequences
print('\n\n### Result on the Test Set ###')
compare_prediction(model, tar_tokenizer, testX, test)
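# Illustrative single-sentence translation using the helpers above (the example
# sentence is arbitrary; the output depends on the trained model and fitted tokenizers).
sample_fr = clean("je suis content")
sample_seq = encode_sequences(src_tokenizer, src_length, [sample_fr])
print(predict_seq(model, tar_tokenizer, sample_seq))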
# Computing the BLEU score takes a while
def bleu_score(model, tokenizer, sources, raw_dataset):
    # Compute the BLEU score of the model on the given set
    actual, predicted = [], []
    for i, source in enumerate(sources):
        # Translate the encoded source text
        source = source.reshape((1, source.shape[0]))
        translation = predict_seq(model, tokenizer, source)
        raw_target, raw_src = raw_dataset[i]
        actual.append([raw_target.split()])
        predicted.append(translation.split())

    bleu_dic = {}
    bleu_dic['1-grams'] = corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0))
    bleu_dic['1-2-grams'] = corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0))
    bleu_dic['1-3-grams'] = corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0))
    bleu_dic['1-4-grams'] = corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25))
    return bleu_dic


# Compute the BLEU score on the training and test sets
bleu_train = bleu_score(model, tar_tokenizer, trainX, train)
bleu_test = bleu_score(model, tar_tokenizer, testX, test)

plt.bar(x=list(bleu_train.keys()), height=list(bleu_train.values()))
plt.title("BLEU Score with the training set")
plt.ylim((0, 1))
plt.show()

plt.bar(x=list(bleu_test.keys()), height=list(bleu_test.values()))
plt.title("BLEU Score with the test set")
plt.ylim((0, 1))
plt.show()

model.save('/content/drive/MyDrive/Colab Notebooks/Models/french_to_english_translator.h5')

import gradio as gr

# Load the trained model
model = load_model('/content/drive/MyDrive/Colab Notebooks/Models/french_to_english_translator.h5')


def translate_french_to_english(french_sentence):
    # Translate a French sentence to English with the trained model
    # Clean the input sentence
    french_sentence = clean(french_sentence)
    # Tokenize and pad the input sentence
    input_sequence = encode_sequences(src_tokenizer, src_length, [french_sentence])
    # Generate the translation
    english_translation = predict_seq(model, tar_tokenizer, input_sequence)
    return english_translation


# Create a Gradio interface
gr.Interface(
    fn=translate_french_to_english,
    inputs="text",
    outputs="text",
    title="French to English Translator",
    description="Translate French sentences to English."
).launch()
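# Optional sketch (assumption, not in the original notebook): the Gradio app above
# relies on src_tokenizer, tar_tokenizer and src_length living in memory. To run
# inference in a fresh session, they could be persisted alongside the model, e.g.
# with pickle (the path below is hypothetical):
import pickle

with open('/content/drive/MyDrive/Colab Notebooks/Models/tokenizers.pkl', 'wb') as f:
    pickle.dump({'src_tokenizer': src_tokenizer,
                 'tar_tokenizer': tar_tokenizer,
                 'src_length': src_length}, f)

# In a new session: reload the model with load_model(...) and the tokenizers with
# pickle.load(...) before calling translate_french_to_english.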