KameliaZaman committed
Commit 918fb26 · Parent(s): a81b451
Upload french_to_english_translation_using_seq2seq.py

french_to_english_translation_using_seq2seq.py (ADDED)
# -*- coding: utf-8 -*-
"""french-to-english-translation-using-seq2seq.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1I_pfLKfUYqIWiX3przMoSFvczO_H83er
"""

import warnings
warnings.filterwarnings('ignore')

import re
from string import punctuation

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential, load_model
from keras.layers import LSTM, Dense, Embedding, RepeatVector, TimeDistributed
from keras.callbacks import EarlyStopping
from nltk.translate.bleu_score import corpus_bleu
from IPython.display import Markdown, display
def printmd(string):
    # Print with Markdown formatting
    display(Markdown(string))

from google.colab import drive
drive.mount('/content/drive')

total_sentences = 10000

# Load the dataset
dataset = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Dataset/eng_-french.csv", nrows=total_sentences)

# Proportion of the sentences that will be used for the test set
test_proportion = 0.1
train_test_threshold = int((1 - test_proportion) * total_sentences)

printmd(f'## {total_sentences} "parallel sentences" will be loaded (original sentence + its translation)')
printmd(f'## {train_test_threshold} "parallel sentences" will be used to train the model')
printmd(f'## {total_sentences - train_test_threshold} "parallel sentences" will be used to test the model')

# Shuffle the dataset
dataset = dataset.sample(frac=1, random_state=0)
dataset.iloc[1000:1010]

def clean(string):
    # Normalize a sentence: lowercase it and strip punctuation, digits, and extra spaces
    string = string.replace("\u202f", " ")  # Replace narrow no-break space with a space
    string = string.lower()

    # Delete punctuation, guillemets, and digits
    for p in punctuation + "«»" + "0123456789":
        string = string.replace(p, " ")

    string = re.sub(r'\s+', ' ', string)  # Collapse runs of whitespace
    string = string.strip()

    return string
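
# A quick sanity check of the cleaning step (illustrative sentence, not taken
# from the dataset): accents survive, while punctuation, guillemets, and
# digits are stripped and whitespace is collapsed.
print(clean("Où est la bibliothèque ? Il est 10 h 30 !"))  # -> où est la bibliothèque il est h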

# Clean the sentences
dataset["English words/sentences"] = dataset["English words/sentences"].apply(lambda x: clean(x))
dataset["French words/sentences"] = dataset["French words/sentences"].apply(lambda x: clean(x))

# Select one part of the dataset
dataset = dataset.values
dataset = dataset[:total_sentences]

# Split into train/test
train, test = dataset[:train_test_threshold], dataset[train_test_threshold:]

# Define the names of the source and of the target languages;
# these will be used in the outputs of this notebook
source_str, target_str = "French", "English"

# The column index of the source and of the target in the numpy array
idx_src, idx_tar = 1, 0

# Display the result after cleaning
pd.DataFrame(dataset[1000:1010])

def create_tokenizer(lines):
    # Fit a tokenizer on the given lines
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

def max_len(lines):
    # Length of the longest sentence, in words
    return max(len(line.split()) for line in lines)

def encode_sequences(tokenizer, length, lines):
    # Encode and pad sequences
    X = tokenizer.texts_to_sequences(lines)  # integer-encode sequences
    X = pad_sequences(X, maxlen=length, padding='post')  # pad sequences with 0 values
    return X

def encode_output(sequences, vocab_size):
    # One-hot encode the target sequences
    ylist = list()
    for sequence in sequences:
        encoded = to_categorical(sequence, num_classes=vocab_size)
        ylist.append(encoded)
    y = np.array(ylist)
    y = y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)
    return y
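
# Illustrative shapes (hypothetical numbers): with 9,000 training pairs, a
# sentence length of 10 words, and a vocabulary of 2,000 words,
# encode_sequences returns an integer array of shape (9000, 10) and
# encode_output turns it into a one-hot array of shape (9000, 10, 2000).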

# Prepare target tokenizer
tar_tokenizer = create_tokenizer(dataset[:, idx_tar])
tar_vocab_size = len(tar_tokenizer.word_index) + 1
tar_length = max_len(dataset[:, idx_tar])
printmd(f'\nTarget ({target_str}) Vocabulary Size: {tar_vocab_size}')
printmd(f'Target ({target_str}) Max Length: {tar_length}')

# Prepare source tokenizer
src_tokenizer = create_tokenizer(dataset[:, idx_src])
src_vocab_size = len(src_tokenizer.word_index) + 1
src_length = max_len(dataset[:, idx_src])
printmd(f'\nSource ({source_str}) Vocabulary Size: {src_vocab_size}')
printmd(f'Source ({source_str}) Max Length: {src_length}\n')

# Prepare training data
trainX = encode_sequences(src_tokenizer, src_length, train[:, idx_src])
trainY = encode_sequences(tar_tokenizer, tar_length, train[:, idx_tar])
trainY = encode_output(trainY, tar_vocab_size)

# Prepare test data
testX = encode_sequences(src_tokenizer, src_length, test[:, idx_src])
testY = encode_sequences(tar_tokenizer, tar_length, test[:, idx_tar])
testY = encode_output(testY, tar_vocab_size)

def create_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    # Create the encoder-decoder model
    model = Sequential()
    model.add(Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True))
    model.add(LSTM(n_units))
    model.add(RepeatVector(tar_timesteps))
    model.add(LSTM(n_units, return_sequences=True))
    model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))
    return model
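
# The first LSTM (the encoder) compresses the whole source sentence into a
# single n_units-dimensional vector; RepeatVector copies that vector once per
# target timestep so the second LSTM (the decoder) can emit a softmax over the
# target vocabulary at every output position. To inspect the layer shapes
# (optional sketch):
# create_model(src_vocab_size, tar_vocab_size, src_length, tar_length, 256).summary()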

# Create and compile the model
model = create_model(src_vocab_size, tar_vocab_size, src_length, tar_length, 256)
model.compile(optimizer='adam', loss='categorical_crossentropy')

history = model.fit(trainX,
                    trainY,
                    epochs=20,
                    batch_size=64,
                    validation_split=0.1,
                    verbose=1,
                    callbacks=[
                        EarlyStopping(
                            monitor='val_loss',
                            patience=10,
                            restore_best_weights=True
                        )
                    ])
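
# Note: with patience=10 and only 20 epochs, EarlyStopping rarely triggers; its
# main effect here is restore_best_weights, which rolls the model back to the
# epoch with the lowest validation loss.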

# Plot the training and validation loss
pd.DataFrame(history.history).plot()
plt.title("Loss")
plt.show()

def word_for_id(integer, tokenizer):
    # Map an integer back to its word
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

def predict_seq(model, tokenizer, source):
    # Generate a target sentence from an encoded source sequence
    prediction = model.predict(source, verbose=0)[0]
    integers = [np.argmax(vector) for vector in prediction]
    target = list()
    for i in integers:
        word = word_for_id(i, tokenizer)
        if word is None:
            break
        target.append(word)
    return ' '.join(target)
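
# word_for_id scans the whole vocabulary for every predicted token. A faster
# alternative (a sketch, not part of the original notebook) inverts the
# mapping once and uses a dictionary lookup instead:
# id_to_word = {index: word for word, index in tar_tokenizer.word_index.items()}
# word = id_to_word.get(i)  # None for the padding id 0, like word_for_id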

def compare_prediction(model, tokenizer, sources, raw_dataset, limit=20):
    # Print source sentences, reference translations, and model translations side by side
    src = f'{source_str.upper()} (SOURCE)'
    tgt = f'{target_str.upper()} (TARGET)'
    pred = f'AUTOMATIC TRANSLATION IN {target_str.upper()}'
    print(f'{src:30} {tgt:25} {pred}\n')

    for i, source in enumerate(sources):  # translate encoded source text
        source = source.reshape((1, source.shape[0]))
        translation = predict_seq(model, tokenizer, source)
        raw_target, raw_src = raw_dataset[i]
        print(f'{raw_src:30} {raw_target:25} {translation}')
        if i >= limit:  # Display only the first results
            break

# Test on some training sequences
print('### Result on the Training Set ###')
compare_prediction(model, tar_tokenizer, trainX, train)

# Test on some test sequences
print('\n\n### Result on the Test Set ###')
compare_prediction(model, tar_tokenizer, testX, test)

# Computing the BLEU score takes a while

def bleu_score(model, tokenizer, sources, raw_dataset):
    # Get the BLEU scores of a model
    actual, predicted = [], []
    for i, source in enumerate(sources):
        # Translate the encoded source text
        source = source.reshape((1, source.shape[0]))
        translation = predict_seq(model, tokenizer, source)
        raw_target, raw_src = raw_dataset[i]
        actual.append([raw_target.split()])
        predicted.append(translation.split())

    bleu_dic = {}
    bleu_dic['1-grams'] = corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0))
    bleu_dic['1-2-grams'] = corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0))
    bleu_dic['1-3-grams'] = corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0))
    bleu_dic['1-4-grams'] = corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25))

    return bleu_dic
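
# The weights pick which n-gram precisions enter the score: (1.0, 0, 0, 0)
# uses unigrams only, while (0.25, 0.25, 0.25, 0.25) is the standard BLEU-4
# average. corpus_bleu expects a list of reference lists per sentence, which
# is why raw_target.split() is wrapped in an extra list above.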

# Compute the BLEU score on the training and test sets
bleu_train = bleu_score(model, tar_tokenizer, trainX, train)
bleu_test = bleu_score(model, tar_tokenizer, testX, test)

plt.bar(x=list(bleu_train.keys()), height=list(bleu_train.values()))
plt.title("BLEU Score with the training set")
plt.ylim((0, 1))
plt.show()

plt.bar(x=list(bleu_test.keys()), height=list(bleu_test.values()))
plt.title("BLEU Score with the test set")
plt.ylim((0, 1))
plt.show()

# Save the trained model
model.save('/content/drive/MyDrive/Colab Notebooks/Models/french_to_english_translator.h5')

import gradio as gr

# Load the trained model
model = load_model('/content/drive/MyDrive/Colab Notebooks/Models/french_to_english_translator.h5')

# Function to translate French to English
def translate_french_to_english(french_sentence):
    # Clean the input sentence
    french_sentence = clean(french_sentence)
    # Tokenize and pad the input sentence
    input_sequence = encode_sequences(src_tokenizer, src_length, [french_sentence])
    # Generate the translation
    english_translation = predict_seq(model, tar_tokenizer, input_sequence)
    return english_translation

# Create a Gradio interface
gr.Interface(
    fn=translate_french_to_english,
    inputs="text",
    outputs="text",
    title="French to English Translator",
    description="Translate French sentences to English."
).launch()
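
# A quick check of the inference path without the web UI (illustrative input;
# the exact output depends on the trained weights):
# print(translate_french_to_english("je suis fatigué"))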