"""Gradio demo that reads captcha text from an image with a CTC-trained OCR model.

The module loads a saved Keras model at import time, defines the CTC loss
helpers and the ``CTCLayer`` custom layer, and exposes a Gradio interface
whose handler is ``decode_batch_predictions``.
"""

import io
import os

# The backend is selected before `keras` is imported below.
os.environ["KERAS_BACKEND"] = "tensorflow"

from collections import Counter
from pathlib import Path

import gradio as gr
import keras
import numpy as np
import tensorflow as tf
from keras import layers
from PIL import Image


def ctc_batch_cost(y_true, y_pred, input_length, label_length):
    """Compute the CTC loss for each element of a batch.

    Args:
        y_true: Dense label tensor; presumably ``(batch, max_label_len)`` --
            TODO confirm against the training pipeline.
        y_pred: Per-timestep class probabilities; indexed below as
            ``(batch, time, classes)``.
        input_length: Tensor whose last axis is squeezed away; one prediction
            length per batch element.
        label_length: Tensor whose last axis is squeezed away; one label
            length per batch element.

    Returns:
        The result of ``tf.compat.v1.nn.ctc_loss`` with an extra axis added
        at position 1.
    """
    label_length = tf.cast(tf.squeeze(label_length, axis=-1), tf.int32)
    input_length = tf.cast(tf.squeeze(input_length, axis=-1), tf.int32)
    sparse_labels = tf.cast(ctc_label_dense_to_sparse(y_true, label_length), tf.int32)

    # Swap the first two axes and move to log space; epsilon keeps log() finite
    # when a probability is exactly zero.
    y_pred = tf.math.log(tf.transpose(y_pred, perm=[1, 0, 2]) + keras.backend.epsilon())

    return tf.expand_dims(
        tf.compat.v1.nn.ctc_loss(
            inputs=y_pred, labels=sparse_labels, sequence_length=input_length
        ),
        1,
    )


def ctc_label_dense_to_sparse(labels, label_lengths):
    """Convert dense labels plus per-row lengths into a ``tf.SparseTensor``.

    Only the first ``label_lengths[i]`` entries of row ``i`` of ``labels`` are
    kept in the sparse result.

    Args:
        labels: 2-D dense label tensor (its shape is read as two dimensions).
        label_lengths: 1-D tensor with the number of valid labels per row.

    Returns:
        A ``tf.SparseTensor`` with int64 indices and the dense shape of
        ``labels``.
    """
    label_shape = tf.shape(labels)
    num_batches_tns = tf.stack([label_shape[0]])
    max_num_labels_tns = tf.stack([label_shape[1]])

    def range_less_than(old_input, current_input):
        # Boolean row that is True for positions below the current length.
        return tf.expand_dims(tf.range(tf.shape(old_input)[1]), 0) < tf.fill(
            max_num_labels_tns, current_input
        )

    # Scan over the lengths to build one boolean mask row per batch element.
    init = tf.cast(tf.fill([1, label_shape[1]], 0), tf.bool)
    dense_mask = tf.compat.v1.scan(
        range_less_than, label_lengths, initializer=init, parallel_iterations=1
    )
    dense_mask = dense_mask[:, 0, :]

    # Column index of every kept label.
    label_array = tf.reshape(
        tf.tile(tf.range(0, label_shape[1]), num_batches_tns), label_shape
    )
    label_ind = tf.compat.v1.boolean_mask(label_array, dense_mask)

    # Row (batch) index of every kept label.
    batch_array = tf.transpose(
        tf.reshape(
            tf.tile(tf.range(0, label_shape[0]), max_num_labels_tns),
            tf.reverse(label_shape, [0]),
        )
    )
    batch_ind = tf.compat.v1.boolean_mask(batch_array, dense_mask)

    # Pair the row and column indices into an (n, 2) index matrix.
    indices = tf.transpose(
        tf.reshape(tf.concat([batch_ind, label_ind], axis=0), [2, -1])
    )

    vals_sparse = tf.compat.v1.gather_nd(labels, indices)

    return tf.SparseTensor(
        tf.cast(indices, tf.int64), vals_sparse, tf.cast(label_shape, tf.int64)
    )


class CTCLayer(layers.Layer):
    """Layer that registers the CTC loss and passes predictions through."""

    def __init__(self, name=None, **kwargs):
        """Create the layer.

        Args:
            name: Optional layer name.
            **kwargs: Standard ``Layer`` arguments (for example ``trainable``
                and ``dtype``). They must be accepted here because
                ``Layer.get_config()`` emits them and ``from_config`` passes
                them back when a saved model is loaded via ``custom_objects``.
        """
        super().__init__(name=name, **kwargs)
        self.loss_fn = ctc_batch_cost

    def call(self, y_true, y_pred):
        """Add the CTC loss for ``(y_true, y_pred)`` and return ``y_pred``."""
        # Compute the training-time loss value and add it to the layer using
        # `self.add_loss()`.
        batch_len = tf.cast(tf.shape(y_true)[0], dtype="int64")
        input_length = tf.cast(tf.shape(y_pred)[1], dtype="int64")
        label_length = tf.cast(tf.shape(y_true)[1], dtype="int64")

        # Every sample uses the full prediction length and full label width.
        input_length = input_length * tf.ones(shape=(batch_len, 1), dtype="int64")
        label_length = label_length * tf.ones(shape=(batch_len, 1), dtype="int64")

        loss = self.loss_fn(y_true, y_pred, input_length, label_length)
        self.add_loss(loss)

        # At test time, just return the computed predictions
        return y_pred


loaded_model = keras.models.load_model(
    "ocr_model_pred.keras", custom_objects={"CTCLayer": CTCLayer}
)
loaded_model.load_weights("ocr_model_pred_weights.h5")

# Decoded sequences are truncated to this many characters.
max_len = 5

# Vocabulary of the recognizer. Note that '0' and lowercase 'o' are not listed.
characters = ['1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']

# mapping characters to integers
char_to_num = layers.StringLookup(vocabulary=list(characters), mask_token=None)

# Mapping integers back to original characters
num_to_char = layers.StringLookup(
    vocabulary=char_to_num.get_vocabulary(), mask_token=None, invert=True
)

image_width = 128
image_height = 32

# PIL modes that Pillow's JPEG writer accepts; anything else is converted to
# RGB before encoding.
_JPEG_WRITABLE_MODES = frozenset({"1", "L", "RGB", "RGBX", "CMYK", "YCbCr"})


def distortion_free_resize(image, img_size):
    """Resize ``image`` to ``img_size`` without changing its aspect ratio.

    The image is scaled to fit, padded to the exact target size, then
    transposed (height and width axes swapped) and flipped left-to-right.

    Args:
        image: Image tensor with three axes (it is transposed with
            ``perm=[1, 0, 2]``).
        img_size: ``(width, height)`` pair.

    Returns:
        The resized, padded, transposed and flipped image tensor.
    """
    w, h = img_size
    image = tf.image.resize(image, size=(h, w), preserve_aspect_ratio=True)
    image = tf.image.resize_with_pad(image, target_height=h, target_width=w)
    image = tf.transpose(image, perm=[1, 0, 2])
    image = tf.image.flip_left_right(image)
    return image


def decode_batch_predictions(input_image, img_size=(image_width, image_height)):
    """Run the OCR model on one PIL image and return the decoded text.

    Args:
        input_image: A ``PIL.Image.Image`` (Gradio supplies it with
            ``type="pil"``).
        img_size: ``(width, height)`` the image is resized to before
            prediction.

    Returns:
        A list of decoded strings, one per item in the prediction batch
        (a single item for a single input image).

    Raises:
        ValueError: If ``input_image`` is ``None``, e.g. when the form is
            submitted without an image.
    """
    if input_image is None:
        raise ValueError("No input image was provided.")

    # JPEG cannot store modes such as RGBA or P; without this conversion
    # `save` raises OSError for those images.
    if input_image.mode not in _JPEG_WRITABLE_MODES:
        input_image = input_image.convert("RGB")

    img_byte_array = io.BytesIO()
    input_image.save(img_byte_array, format='JPEG')  # Change the format as needed
    input_image = img_byte_array.getvalue()

    # Decode to a single-channel uint8 tensor, then resize/pad/transpose.
    input_image = tf.io.decode_image(input_image, channels=1, dtype=tf.dtypes.uint8)
    input_image = distortion_free_resize(input_image, img_size)
    input_image = tf.expand_dims(input_image, axis=0)

    # Scale pixel values by 1/255. An explicit cast is used instead of
    # `convert_image_dtype`, which would rescale a second time if the tensor
    # were ever an integer type at this point.
    input_image = tf.cast(input_image, tf.float32) / 255.0

    pred = loaded_model.predict(input_image)

    # Every sample uses the full number of prediction timesteps.
    input_len = np.ones(pred.shape[0]) * pred.shape[1]
    results = keras.backend.ctc_decode(pred, input_length=input_len, greedy=True)[0][0][
        :, :max_len
    ]

    # Iterate over the results and get back the text.
    output_text = []
    for res in results:
        # Entries equal to -1 are dropped before mapping indices to characters.
        res = tf.gather(res, tf.where(tf.math.not_equal(res, -1)))
        res = tf.strings.reduce_join(num_to_char(res)).numpy().decode("utf-8")
        output_text.append(res)
    return output_text


interface = gr.Interface(
    fn=decode_batch_predictions,
    inputs=gr.Image(label="Input image", type="pil"),
    outputs='text',
    title='Captcha Recognition',
    theme='darkhuggingface',
)

interface.launch(inline=False)