KameliaZaman committed on
Commit
918fb26
1 Parent(s): a81b451

Upload french_to_english_translation_using_seq2seq.py

Browse files
french_to_english_translation_using_seq2seq.py ADDED
@@ -0,0 +1,273 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """french-to-english-translation-using-seq2seq.ipynb
3
+
4
+ Automatically generated by Colab.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1I_pfLKfUYqIWiX3przMoSFvczO_H83er
8
+ """
9
+
10
+ import warnings
11
+ warnings.filterwarnings('ignore')
12
+ import string
13
+ import re
14
+ from unicodedata import normalize
15
+ import numpy as np
16
+ from keras.preprocessing.text import Tokenizer
17
+ from keras.preprocessing.sequence import pad_sequences
18
+ from keras.utils import to_categorical
19
+ from keras.models import Sequential,load_model
20
+ from keras.layers import LSTM,Dense,Embedding,RepeatVector,TimeDistributed
21
+ from keras.callbacks import EarlyStopping
22
+ from keras.preprocessing.text import Tokenizer
23
+ from keras.preprocessing.sequence import pad_sequences
24
+ from nltk.translate.bleu_score import corpus_bleu
25
+ import pandas as pd
26
+ from string import punctuation
27
+ import matplotlib.pyplot as plt
28
+ from IPython.display import Markdown, display
29
+
30
def printmd(string):
    """Display *string* through IPython's ``Markdown`` renderer."""
    rendered = Markdown(string)
    display(rendered)
33
+
34
from google.colab import drive

# Mount Google Drive; the dataset below is read from the mounted path.
drive.mount('/content/drive')

# Number of rows read from the CSV file.
total_sentences = 10000

# Load the dataset (only the first `total_sentences` rows).
dataset = pd.read_csv(
    "/content/drive/MyDrive/Colab Notebooks/Dataset/eng_-french.csv",
    nrows=total_sentences,
)

# Fraction of the sentences held out for the test set, and the row index
# at which the train/test split happens.
test_proportion = 0.1
train_test_threshold = int((1 - test_proportion) * total_sentences)

printmd(f'## {total_sentences} "parallel sentences" will be loaded (original sentence + its translation)')
printmd(f'## {train_test_threshold} "parallel sentences" will be used to train the model')
printmd(f'## {total_sentences-train_test_threshold} "parallel sentences" will be used to test the model')

# Shuffle the rows with a fixed seed so the split is reproducible.
dataset = dataset.sample(frac=1, random_state=0)

# Notebook preview of a few shuffled rows (no effect when run as a script).
dataset.iloc[1000:1010]
53
+
54
# Maps every punctuation mark, guillemet and digit to a space in one pass.
_CLEAN_TABLE = str.maketrans(
    {char: " " for char in punctuation + "«»" + "0123456789"}
)
# Raw string: '\s' in a normal string literal is an invalid escape sequence.
_WHITESPACE_RUN = re.compile(r"\s+")


def clean(string):
    """Normalise a sentence before tokenisation.

    The text is lower-cased, narrow no-break spaces (U+202F) become plain
    spaces, punctuation, guillemets and digits are replaced by spaces, runs
    of whitespace are collapsed to a single space, and leading/trailing
    whitespace is removed.

    Args:
        string: The raw sentence.

    Returns:
        The cleaned sentence.
    """
    string = string.replace("\u202f", " ")  # Replace no-break space with space
    string = string.lower()

    # Delete the punctuation and the numbers
    string = string.translate(_CLEAN_TABLE)

    return _WHITESPACE_RUN.sub(" ", string).strip()
67
+
68
# Clean the sentences of both languages with the same routine.
for column in ("English words/sentences", "French words/sentences"):
    dataset[column] = dataset[column].apply(clean)

# Work on the underlying array and keep one part of the dataset.
dataset = dataset.values
dataset = dataset[:total_sentences]

# Split into train/test at the precomputed threshold.
train = dataset[:train_test_threshold]
test = dataset[train_test_threshold:]

# Names of the source and of the target language.
# These are used in the outputs of this notebook.
source_str = "French"
target_str = "English"

# Column index in the numpy array of the source and of the target.
idx_src = 1
idx_tar = 0

# Notebook preview of the result after cleaning.
pd.DataFrame(dataset[1000:1010])
88
+
89
def create_tokenizer(lines):
    """Create a ``Tokenizer`` and fit it on *lines*.

    Args:
        lines: The texts passed to ``Tokenizer.fit_on_texts``.

    Returns:
        The fitted tokenizer.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted
94
+
95
def max_len(lines):
    """Return the largest whitespace-separated word count among *lines*.

    Args:
        lines: An iterable of sentences (strings).

    Returns:
        The word count of the longest sentence, or 0 when *lines* is empty
        (a bare ``max()`` would raise ``ValueError`` on empty input).
    """
    return max((len(line.split()) for line in lines), default=0)
98
+
99
def encode_sequences(tokenizer, length, lines):
    """Integer-encode *lines* and pad them to a common length.

    Args:
        tokenizer: A fitted tokenizer providing ``texts_to_sequences``.
        length: The ``maxlen`` passed to ``pad_sequences``.
        lines: The sentences to encode.

    Returns:
        The result of ``pad_sequences`` with padding (0 values) appended
        after each sequence.
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(lines),  # integer encode sequences
        maxlen=length,
        padding='post',
    )
104
+
105
def encode_output(sequences, vocab_size):
    """One-hot encode integer target sequences.

    Args:
        sequences: 2-D array of integer-encoded sequences; its first two
            dimensions define the shape of the result.
        vocab_size: Number of classes for the one-hot encoding.

    Returns:
        An array of shape
        ``(sequences.shape[0], sequences.shape[1], vocab_size)``.
    """
    one_hot = np.array(
        [to_categorical(row, num_classes=vocab_size) for row in sequences]
    )
    n_samples, n_steps = sequences.shape[0], sequences.shape[1]
    return one_hot.reshape(n_samples, n_steps, vocab_size)
114
+
115
# Prepare target tokenizer (fitted on the whole dataset, train and test).
tar_tokenizer = create_tokenizer(dataset[:, idx_tar])
tar_vocab_size = 1 + len(tar_tokenizer.word_index)  # +1: index 0 is the padding value
tar_length = max_len(dataset[:, idx_tar])
printmd(f'\nTarget ({target_str}) Vocabulary Size: {tar_vocab_size}')
printmd(f'Target ({target_str}) Max Length: {tar_length}')

# Prepare source tokenizer (same procedure as for the target).
src_tokenizer = create_tokenizer(dataset[:, idx_src])
src_vocab_size = 1 + len(src_tokenizer.word_index)
src_length = max_len(dataset[:, idx_src])
printmd(f'\nSource ({source_str}) Vocabulary Size: {src_vocab_size}')
printmd(f'Source ({source_str}) Max Length: {src_length}\n')

# Prepare training data: padded integer inputs, one-hot encoded outputs.
trainX = encode_sequences(src_tokenizer, src_length, train[:, idx_src])
trainY = encode_output(
    encode_sequences(tar_tokenizer, tar_length, train[:, idx_tar]),
    tar_vocab_size,
)

# Prepare test data the same way.
testX = encode_sequences(src_tokenizer, src_length, test[:, idx_src])
testY = encode_output(
    encode_sequences(tar_tokenizer, tar_length, test[:, idx_tar]),
    tar_vocab_size,
)
138
+
139
def create_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    """Build the (uncompiled) encoder-decoder translation model.

    The layers are an embedding (with zero-masking), an encoder LSTM, a
    ``RepeatVector`` that repeats the encoder output once per target time
    step, a decoder LSTM returning sequences, and a time-distributed
    softmax over the target vocabulary.

    Args:
        src_vocab: Size of the source vocabulary (embedding input dimension).
        tar_vocab: Size of the target vocabulary (softmax width).
        src_timesteps: Length of the padded source sequences.
        tar_timesteps: Length of the padded target sequences.
        n_units: Embedding dimension and number of LSTM units.

    Returns:
        The ``Sequential`` model.
    """
    model = Sequential()
    # Use the function's own arguments rather than the module-level
    # src_vocab_size / src_length, so the model really has the shape the
    # caller asked for.
    model.add(Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True))
    model.add(LSTM(n_units))
    model.add(RepeatVector(tar_timesteps))
    model.add(LSTM(n_units, return_sequences=True))
    model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))
    return model
148
+
149
# Create and compile the model.
model = create_model(src_vocab_size, tar_vocab_size, src_length, tar_length, 256)
model.compile(optimizer='adam', loss='categorical_crossentropy')

# Stop once the validation loss has not improved for 10 epochs and
# roll back to the best weights seen.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)

history = model.fit(
    trainX,
    trainY,
    epochs=20,
    batch_size=64,
    validation_split=0.1,
    verbose=1,
    callbacks=[early_stopping],
)

# Plot the recorded training history.
pd.DataFrame(history.history).plot()
plt.title("Loss")
plt.show()
170
+
171
def word_for_id(integer, tokenizer):
    """Map an integer index back to its word.

    Args:
        integer: The index to look up.
        tokenizer: An object whose ``word_index`` maps words to indices.

    Returns:
        The first word whose index equals *integer*, or ``None`` when no
        word has that index.
    """
    matches = (
        word
        for word, index in tokenizer.word_index.items()
        if index == integer
    )
    return next(matches, None)
177
+
178
def predict_seq(model, tokenizer, source):
    """Generate the target sentence for one encoded source sequence.

    Args:
        model: An object whose ``predict(source, verbose=0)`` returns a batch
            of per-time-step score vectors; only the first batch item is used.
        tokenizer: An object whose ``word_index`` maps target words to indices.
        source: The encoded source sequence, passed to ``model.predict``.

    Returns:
        The predicted words joined by single spaces. Decoding stops at the
        first predicted index that has no word in the vocabulary.
    """
    prediction = model.predict(source, verbose=0)[0]

    # Invert the vocabulary once instead of scanning it for every time step.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    target = []
    for token_id in np.argmax(prediction, axis=-1):
        word = index_to_word.get(int(token_id))
        if word is None:
            break
        target.append(word)
    return ' '.join(target)
189
+
190
def compare_prediction(model, tokenizer, sources, raw_dataset, limit=20):
    """Print source, reference and predicted translation side by side.

    Args:
        model: The translation model handed to ``predict_seq``.
        tokenizer: The target-language tokenizer handed to ``predict_seq``.
        sources: Encoded source sequences, one per row of *raw_dataset*.
        raw_dataset: Rows of ``(target sentence, source sentence)``, aligned
            with *sources*.
        limit: Maximum number of rows to print.
    """
    src = f'{source_str.upper()} (SOURCE)'
    tgt = f'{target_str.upper()} (TARGET)'
    pred = f'AUTOMATIC TRANSLATION IN {target_str.upper()}'
    print(f'{src:30} {tgt:25} {pred}\n')

    for i, source in enumerate(sources):
        # Check before translating so exactly `limit` rows are shown
        # (checking after the print would show limit + 1 rows).
        if i >= limit:
            break
        source = source.reshape((1, source.shape[0]))  # add the batch dimension
        translation = predict_seq(model, tokenizer, source)
        raw_target, raw_src = raw_dataset[i]
        print(f'{raw_src:30} {raw_target:25} {translation}')
205
+
206
# Show sample translations, first for training sequences, then for test sequences.
for banner, encoded, raw in (
    ('### Result on the Training Set ###', trainX, train),
    ('\n\n### Result on the Test Set ###', testX, test),
):
    print(banner)
    compare_prediction(model, tar_tokenizer, encoded, raw)
213
+
214
+ # Computing the BLEU score takes a long time
215
+
216
def bleu_score(model, tokenizer, sources, raw_dataset):
    """Compute corpus-level BLEU scores of *model* on a dataset.

    Args:
        model: The translation model handed to ``predict_seq``.
        tokenizer: The target-language tokenizer handed to ``predict_seq``.
        sources: Encoded source sequences, one per row of *raw_dataset*.
        raw_dataset: Rows of ``(target sentence, source sentence)``, aligned
            with *sources*.

    Returns:
        A dict with the keys ``'1-grams'``, ``'1-2-grams'``, ``'1-3-grams'``
        and ``'1-4-grams'`` mapping to the corresponding ``corpus_bleu``
        scores.
    """
    actual, predicted = [], []
    for source, (raw_target, _raw_src) in zip(sources, raw_dataset):
        # translate encoded source text
        source = source.reshape((1, source.shape[0]))
        # Use the tokenizer that was passed in, not the module-level one.
        translation = predict_seq(model, tokenizer, source)
        actual.append([raw_target.split()])  # one reference per sentence
        predicted.append(translation.split())

    # Each weight tuple sums to 1 so the n-gram precisions are combined as a
    # proper weighted geometric mean (the 3-gram case uses 1/3 each instead
    # of 0.3, which only summed to 0.9).
    weight_sets = {
        '1-grams': (1.0, 0, 0, 0),
        '1-2-grams': (0.5, 0.5, 0, 0),
        '1-3-grams': (1 / 3, 1 / 3, 1 / 3, 0),
        '1-4-grams': (0.25, 0.25, 0.25, 0.25),
    }
    bleu_dic = {}
    for label, weights in weight_sets.items():
        bleu_dic[label] = corpus_bleu(actual, predicted, weights=weights)

    return bleu_dic
234
+
235
# Compute the BLEU Score on both splits.
bleu_train = bleu_score(model, tar_tokenizer, trainX, train)
bleu_test = bleu_score(model, tar_tokenizer, testX, test)

# One bar chart per split, training set first.
for bleu_scores, chart_title in (
    (bleu_train, "BLEU Score with the training set"),
    (bleu_test, "BLEU Score with the test set"),
):
    plt.bar(x=bleu_scores.keys(), height=bleu_scores.values())
    plt.title(chart_title)
    plt.ylim((0, 1))
    plt.show()

# Persist the trained model to Google Drive.
model.save('/content/drive/MyDrive/Colab Notebooks/Models/french_to_english_translator.h5')

import gradio as gr

# Load the trained model back from the file written above.
model = load_model('/content/drive/MyDrive/Colab Notebooks/Models/french_to_english_translator.h5')
255
+
256
+ # Function to translate French to English
257
def translate_french_to_english(french_sentence):
    """Translate one French sentence with the trained model.

    The sentence is cleaned, encoded and padded with the source tokenizer,
    then decoded with ``predict_seq`` and the target tokenizer.

    Args:
        french_sentence: The raw French text.

    Returns:
        The predicted English translation as a string.
    """
    cleaned = clean(french_sentence)
    encoded = encode_sequences(src_tokenizer, src_length, [cleaned])
    return predict_seq(model, tar_tokenizer, encoded)
265
+
266
# Create a Gradio interface around the translation function, then launch it.
demo = gr.Interface(
    fn=translate_french_to_english,
    inputs="text",
    outputs="text",
    title="French to English Translator",
    description="Translate French sentences to English.",
)
demo.launch()