AhmedSSabir committed
Commit 8a37338
1 parent: 365b962

Update README.md

Files changed (1): README.md (+397, -1)
README.md CHANGED
@@ -80,4 +80,400 @@ y_out = sess.run(y, feed_dict={
 
 
   print(y_out)
- ````
+ ````
+
+ For training and inference, use the script below. It exports the pre-trained
+ BERT checkpoint as a TF-Hub module, fine-tunes it with a CNN head on
+ visual-context/caption pairs, freezes the resulting graph, and runs inference.
+
+
+ ```python
+
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+ import sys
+ import argparse
+ import re
+ import os
+ import json
+
+ import logging
+ import numpy as np
+ import pandas as pd
+ import tensorflow as tf
+ import tensorflow_hub as hub
+ from BertLayer import BertLayer
+ from BertLayer import build_preprocessor
+ from freeze_keras_model import freeze_keras_model
+
+ from data_pre import *
+ from tensorflow import keras
+ from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint
+ from sklearn.model_selection import train_test_split
+
+
+ # make the cloned BERT repo importable
+ if 'bert_repo' not in sys.path:
+     sys.path.insert(0, 'bert_repo')
+
+ from modeling import BertModel, BertConfig
+ from tokenization import FullTokenizer, convert_to_unicode
+ from extract_features import InputExample, convert_examples_to_features
+
+
+ # get the TF logger and silence its default handlers
+ log = logging.getLogger('tensorflow')
+ log.handlers = []
+
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--train', default='train.tsv', help='path to the training data (TSV)', type=str, required=False)
+ parser.add_argument('--num_bert_layer', default=12, help='number of BERT layers to fine-tune', type=int, required=False)
+ parser.add_argument('--batch_size', default=128, help='training batch size', type=int, required=False)
+ parser.add_argument('--epochs', default=5, help='number of training epochs', type=int, required=False)
+ parser.add_argument('--seq_len', default=64, help='maximum sequence length', type=int, required=False)
+ parser.add_argument('--CNN_kernel_size', default=3, help='CNN kernel size', type=int, required=False)
+ parser.add_argument('--CNN_filters', default=32, help='number of CNN filters', type=int, required=False)
+ args = parser.parse_args()
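+
+ # Hypothetical invocation (the script name is assumed for illustration):
+ #   python train_bert_cnn.py --train train.tsv --batch_size 128 --epochs 5 --seq_len 64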
+
+
+ # Download the pre-trained BERT model
+
+ #!wget https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
+ #!unzip uncased_L-12_H-768_A-12.zip
+
+
+ # Wrap the pre-trained checkpoint as a TF-Hub module
+ def build_module_fn(config_path, vocab_path, do_lower_case=True):
+
+     def bert_module_fn(is_training):
+         """Spec function for a token embedding module."""
+
+         input_ids = tf.placeholder(shape=[None, None], dtype=tf.int32, name="input_ids")
+         input_mask = tf.placeholder(shape=[None, None], dtype=tf.int32, name="input_mask")
+         token_type = tf.placeholder(shape=[None, None], dtype=tf.int32, name="segment_ids")
+
+         config = BertConfig.from_json_file(config_path)
+         model = BertModel(config=config, is_training=is_training,
+                           input_ids=input_ids, input_mask=input_mask, token_type_ids=token_type)
+
+         seq_output = model.all_encoder_layers[-1]
+         pool_output = model.get_pooled_output()
+
+         config_file = tf.constant(value=config_path, dtype=tf.string, name="config_file")
+         vocab_file = tf.constant(value=vocab_path, dtype=tf.string, name="vocab_file")
+         lower_case = tf.constant(do_lower_case)
+
+         tf.add_to_collection(tf.GraphKeys.ASSET_FILEPATHS, config_file)
+         tf.add_to_collection(tf.GraphKeys.ASSET_FILEPATHS, vocab_file)
+
+         input_map = {"input_ids": input_ids,
+                      "input_mask": input_mask,
+                      "segment_ids": token_type}
+
+         output_map = {"pooled_output": pool_output,
+                       "sequence_output": seq_output}
+
+         output_info_map = {"vocab_file": vocab_file,
+                            "do_lower_case": lower_case}
+
+         hub.add_signature(name="tokens", inputs=input_map, outputs=output_map)
+         hub.add_signature(name="tokenization_info", inputs={}, outputs=output_info_map)
+
+     return bert_module_fn
+
+
+ MODEL_DIR = "uncased_L-12_H-768_A-12"
+ config_path = "{}/bert_config.json".format(MODEL_DIR)
+ vocab_path = "{}/vocab.txt".format(MODEL_DIR)
+
+
+ tags_and_args = []
+ for is_training in (True, False):
+     tags = set()
+     if is_training:
+         tags.add("train")
+     tags_and_args.append((tags, dict(is_training=is_training)))
+
+ module_fn = build_module_fn(config_path, vocab_path)
+ spec = hub.create_module_spec(module_fn, tags_and_args=tags_and_args)
+ spec.export("bert-module",
+             checkpoint_path="{}/bert_model.ckpt".format(MODEL_DIR))
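+
+ # Optional sanity check (a sketch, assuming the export above succeeded):
+ # bert_module = hub.Module("bert-module")
+ # print(bert_module.get_signature_names())  # should include 'tokens' and 'tokenization_info'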
+
+ # Keras layer wrapping the exported TF-Hub BERT module; only the top
+ # n_tune_layers encoder blocks (plus, optionally, embeddings/pooler) are tuned
+ class BertLayer(tf.keras.layers.Layer):
+     def __init__(self, bert_path, seq_len=64, n_tune_layers=3,
+                  pooling="cls", do_preprocessing=True, verbose=False,
+                  tune_embeddings=False, trainable=True, **kwargs):
+
+         self.trainable = trainable
+         self.n_tune_layers = n_tune_layers
+         self.tune_embeddings = tune_embeddings
+         self.do_preprocessing = do_preprocessing
+
+         self.verbose = verbose
+         self.seq_len = seq_len
+         self.pooling = pooling
+         self.bert_path = bert_path
+
+         # each BERT transformer block owns 16 variables
+         self.var_per_encoder = 16
+         if self.pooling not in ["cls", "mean", None]:
+             raise NameError(
+                 f"Undefined pooling type (must be either 'cls', 'mean', or None, but is {self.pooling})"
+             )
+
+         super(BertLayer, self).__init__(**kwargs)
+
+     def build(self, input_shape):
+
+         self.bert = hub.Module(self.build_abspath(self.bert_path),
+                                trainable=self.trainable, name=f"{self.name}_module")
+
+         trainable_layers = []
+         if self.tune_embeddings:
+             trainable_layers.append("embeddings")
+
+         if self.pooling == "cls":
+             trainable_layers.append("pooler")
+
+         if self.n_tune_layers > 0:
+             encoder_var_names = [var.name for var in self.bert.variables if 'encoder' in var.name]
+             n_encoder_layers = int(len(encoder_var_names) / self.var_per_encoder)
+             for i in range(self.n_tune_layers):
+                 trainable_layers.append(f"encoder/layer_{str(n_encoder_layers - 1 - i)}/")
+
+         # Add module variables to the layer's trainable weights
+         for var in self.bert.variables:
+             if any([l in var.name for l in trainable_layers]):
+                 self._trainable_weights.append(var)
+             else:
+                 self._non_trainable_weights.append(var)
+
+         if self.verbose:
+             print("*** TRAINABLE VARS *** ")
+             for var in self._trainable_weights:
+                 print(var)
+
+         self.build_preprocessor()
+         self.initialize_module()
+
+         super(BertLayer, self).build(input_shape)
+
+     def build_abspath(self, path):
+         if path.startswith("https://") or path.startswith("gs://"):
+             return path
+         else:
+             return os.path.abspath(path)
+
+     def build_preprocessor(self):
+         sess = tf.keras.backend.get_session()
+         tokenization_info = self.bert(signature="tokenization_info", as_dict=True)
+         vocab_file, do_lower_case = sess.run([tokenization_info["vocab_file"],
+                                               tokenization_info["do_lower_case"]])
+         self.preprocessor = build_preprocessor(vocab_file, self.seq_len, do_lower_case)
+
+     def initialize_module(self):
+         sess = tf.keras.backend.get_session()
+
+         vars_initialized = sess.run([tf.is_variable_initialized(var)
+                                      for var in self.bert.variables])
+
+         uninitialized = []
+         for var, is_initialized in zip(self.bert.variables, vars_initialized):
+             if not is_initialized:
+                 uninitialized.append(var)
+
+         if len(uninitialized):
+             sess.run(tf.variables_initializer(uninitialized))
+
+     def call(self, input):
+
+         if self.do_preprocessing:
+             input = tf.numpy_function(self.preprocessor,
+                                       [input], [tf.int32, tf.int32, tf.int32],
+                                       name='preprocessor')
+             for feature in input:
+                 feature.set_shape((None, self.seq_len))
+
+         input_ids, input_mask, segment_ids = input
+
+         bert_inputs = dict(
+             input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids
+         )
+         output = self.bert(inputs=bert_inputs, signature="tokens", as_dict=True)
+
+         if self.pooling == "cls":
+             pooled = output["pooled_output"]
+         else:
+             result = output["sequence_output"]
+
+             input_mask = tf.cast(input_mask, tf.float32)
+             mul_mask = lambda x, m: x * tf.expand_dims(m, axis=-1)
+             masked_reduce_mean = lambda x, m: tf.reduce_sum(mul_mask(x, m), axis=1) / (
+                 tf.reduce_sum(m, axis=1, keepdims=True) + 1e-10)
+
+             if self.pooling == "mean":
+                 pooled = masked_reduce_mean(result, input_mask)
+             else:
+                 pooled = mul_mask(result, input_mask)
+
+         return pooled
+
+     def get_config(self):
+         config_dict = {
+             "bert_path": self.bert_path,
+             "seq_len": self.seq_len,
+             "pooling": self.pooling,
+             "n_tune_layers": self.n_tune_layers,
+             "tune_embeddings": self.tune_embeddings,
+             "do_preprocessing": self.do_preprocessing,
+             "verbose": self.verbose
+         }
+         config_dict.update(super(BertLayer, self).get_config())
+         return config_dict
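+
+ # Illustrative standalone use (a sketch): the layer consumes a string tensor
+ # of "text_a ||| text_b" pairs and returns BERT features, e.g.
+ #   features = BertLayer(bert_path="./bert-module/", pooling="cls")(inp)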
+
+
+ # read the training data
+ df = pd.read_csv(args.train, sep='\t')
+
+
+ #labels = df.is_duplicate.values
+ labels = df.is_related.values
+
+ texts = []
+ delimiter = " ||| "
+
+ for vis, cap in zip(df.visual.tolist(), df.caption.tolist()):
+     texts.append(delimiter.join((str(vis), str(cap))))
+
+
+ texts = np.array(texts)
+
+ trX, tsX, trY, tsY = train_test_split(texts, labels, shuffle=True, test_size=0.2)
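+
+ # Each example is a single string "<visual context> ||| <caption>", e.g.
+ # "dog ||| a dog is running on the beach" (illustrative values); the BERT
+ # preprocessor splits on the delimiter into a two-segment input.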
+
+
+ # Building the model
+
+ embedding_size = 768  # BERT-base hidden size
+
+ # input: one raw string per example
+ inp = tf.keras.Input(shape=(1,), dtype=tf.string)
+
+ # BERT encoder
+ # For CLS pooling with a linear layer:
+ #encoder = BertLayer(bert_path="./bert-module/", seq_len=48, tune_embeddings=False,
+ #                    pooling='cls', n_tune_layers=3, verbose=False)
+
+
+ # CNN layers on top of the token-level BERT output
+ encoder = BertLayer(bert_path="./bert-module/", seq_len=args.seq_len, tune_embeddings=False,
+                     pooling=None, n_tune_layers=args.num_bert_layer, verbose=False)
+ cnn_out = tf.keras.layers.Conv1D(args.CNN_filters, args.CNN_kernel_size,
+                                  padding='valid', activation=tf.nn.relu)(encoder(inp))
+ pool = tf.keras.layers.MaxPooling1D(pool_size=2)(cnn_out)
+ flat = tf.keras.layers.Flatten()(pool)
+ pred = tf.keras.layers.Dense(1, activation="sigmoid")(flat)
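+
+ # With pooling=None, BertLayer returns masked token-level features of shape
+ # (batch, seq_len, 768); Conv1D -> MaxPooling1D -> Flatten -> Dense reduces
+ # them to a single relatedness score per pair.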
+
+
+ model = tf.keras.models.Model(inputs=[inp], outputs=[pred])
+
+ model.summary()
+
+ model.compile(
+     optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
+     loss="binary_crossentropy",
+     metrics=["accuracy"])
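+
+ # A small learning rate (1e-5) is the usual choice when fine-tuning BERT;
+ # much larger rates tend to destroy the pre-trained representations.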
+
+ # fit the data; quiet the TF logger during training
+ logging.getLogger("tensorflow").setLevel(logging.WARNING)
+
+ saver = keras.callbacks.ModelCheckpoint("bert_CNN_tuned.hdf5")
+
+ model.fit(trX, trY, validation_data=(tsX, tsY), batch_size=args.batch_size, epochs=args.epochs, callbacks=[saver])
+
+ # sanity-check predictions, then save the architecture
+ model.predict(trX[:10])
+
+ json.dump(model.to_json(), open("model.json", "w"))
+
+ # rebuild the model from JSON and reload the tuned weights
+ model = tf.keras.models.model_from_json(json.load(open("model.json")),
+                                         custom_objects={"BertLayer": BertLayer})
+
+ model.load_weights("bert_CNN_tuned.hdf5")
+
+ model.predict(trX[:10])
+
+ # For faster inference and lower RAM usage, we freeze the model as a
+ # post-processing step.
+ from tensorflow.python.framework.graph_util import convert_variables_to_constants
+ from tensorflow.python.tools.optimize_for_inference_lib import optimize_for_inference
+
+ def freeze_keras_model(model, export_path=None, clear_devices=True):
+     sess = tf.keras.backend.get_session()
+     graph = sess.graph
+
+     with graph.as_default():
+
+         input_tensors = model.inputs
+         output_tensors = model.outputs
+         dtypes = [t.dtype.as_datatype_enum for t in input_tensors]
+         input_ops = [t.name.rsplit(":", maxsplit=1)[0] for t in input_tensors]
+         output_ops = [t.name.rsplit(":", maxsplit=1)[0] for t in output_tensors]
+
+         tmp_g = graph.as_graph_def()
+         if clear_devices:
+             for node in tmp_g.node:
+                 node.device = ""
+
+         tmp_g = optimize_for_inference(
+             tmp_g, input_ops, output_ops, dtypes, False)
+
+         tmp_g = convert_variables_to_constants(sess, tmp_g, output_ops)
+
+         if export_path is not None:
+             with tf.gfile.GFile(export_path, "wb") as f:
+                 f.write(tmp_g.SerializeToString())
+
+     return tmp_g
434
+
435
+
436
+ # freeze and save the model
437
+ frozen_graph = freeze_keras_model(model, export_path="frozen_graph.pb")
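+
+ # Freezing folds the trained variables into graph constants, so the model can
+ # be served from a single .pb file; optimize_for_inference also strips
+ # training-only nodes from the graph.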
438
+
439
+
440
+ # inference
441
+ #!git clone https://github.com/gaphex/bert_experimental/
442
+
443
+ import tensorflow as tf
444
+ import numpy as np
445
+ import sys
446
+
447
+ sys.path.insert(0, "bert_experimental")
448
+
449
+ from bert_experimental.finetuning.text_preprocessing import build_preprocessor
450
+ from bert_experimental.finetuning.graph_ops import load_graph
451
+
452
+
453
+ restored_graph = load_graph("frozen_graph.pb")
454
+ graph_ops = restored_graph.get_operations()
455
+ input_op, output_op = graph_ops[0].name, graph_ops[-1].name
456
+ print(input_op, output_op)
457
+
458
+ x = restored_graph.get_tensor_by_name(input_op + ':0')
459
+ y = restored_graph.get_tensor_by_name(output_op + ':0')
460
+
461
+
462
+ preprocessor = build_preprocessor("vocab.txt", 64)
463
+ py_func = tf.numpy_function(preprocessor, [x], [tf.int32, tf.int32, tf.int32], name='preprocessor')
464
+
465
+ py_func = tf.numpy_function(preprocessor, [x], [tf.int32, tf.int32, tf.int32])
466
+
467
+ # predictions
468
+ sess = tf.Session(graph=restored_graph)
469
+
470
+ trX[:10]
471
+
472
+ y_out = sess.run(y, feed_dict={
473
+ x: trX[:10].reshape((-1,1))
474
+ })
475
+
476
+ print(y_out)
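+
+ # The sigmoid output is a score in [0, 1]; an illustrative 0.5 threshold
+ # turns it into a binary is_related decision:
+ # preds = (y_out > 0.5).astype(int)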
477
+
478
+
479
+ ```
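+
+ Note: the script targets TensorFlow 1.x (it relies on `tf.placeholder`, `tf.Session`, and `hub.Module`) and will not run unmodified on TensorFlow 2.x.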