crobbi committed on
Commit
8a0213d
·
1 Parent(s): 0474dc3

Upload 5 files

Browse files
Files changed (5) hide show
  1. animation.gif +0 -0
  2. modelutil.py +34 -0
  3. streamlitapp.py +60 -0
  4. test_video.mp4 +0 -0
  5. utils.py +52 -0
animation.gif ADDED
modelutil.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from tensorflow.python.ops.numpy_ops import np_config
2
+ np_config.enable_numpy_behavior()
3
+ import os
4
+ from tensorflow.keras.models import Sequential
5
+ from tensorflow.keras.layers import Conv3D, LSTM, Dense, Dropout, Bidirectional, MaxPool3D, Activation, Reshape, SpatialDropout3D, BatchNormalization, TimeDistributed, Flatten
6
+
7
def load_model() -> Sequential:
    """Assemble the lip-reading network and restore its saved weights.

    The stack is three Conv3D / ReLU / MaxPool3D stages (128, 256 and 75
    filters), a per-timestep flatten, two bidirectional LSTM layers each
    followed by 50% dropout, and a 41-way softmax Dense layer. Weights are
    read from ``../models/checkpoint`` (a relative path).

    Returns:
        The Sequential model with the checkpoint weights loaded.
    """
    network = Sequential()

    # Only the first convolution declares the input shape.
    conv_stages = (
        (128, {'input_shape': (75, 46, 140, 1)}),
        (256, {}),
        (75, {}),
    )
    for filters, extra_kwargs in conv_stages:
        network.add(Conv3D(filters, 3, padding='same', **extra_kwargs))
        network.add(Activation('relu'))
        network.add(MaxPool3D((1, 2, 2)))

    # Flatten each timestep independently so the recurrent layers receive
    # one feature vector per step.
    network.add(TimeDistributed(Flatten()))

    for _ in range(2):
        recurrent = LSTM(128, kernel_initializer='Orthogonal', return_sequences=True)
        network.add(Bidirectional(recurrent))
        network.add(Dropout(0.5))

    network.add(Dense(41, kernel_initializer='he_normal', activation='softmax'))

    checkpoint_path = os.path.join('..', 'models', 'checkpoint')
    network.load_weights(checkpoint_path)

    return network
streamlitapp.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Streamlit front end: pick a video from ../data/s1, show an mp4 preview,
# show the frames handed to the model, and display the decoded prediction.

# Import all of the dependencies
import streamlit as st
import os
import subprocess
import imageio
import numpy as np

import tensorflow as tf
from utils import load_data, num_to_char
from modelutil import load_model


# Set the layout to the streamlit app as wide
st.set_page_config(layout='wide')

# Setup the sidebar
with st.sidebar:
    st.image('https://plus.unsplash.com/premium_photo-1682309676673-392c56015c5c?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=1000&q=80')
    st.title('Lip Reading')
    st.info('This application is originally developed from the LipNet deep learning model.')

st.title('LipNet using StreamLit ✌🏻')
# Generating a list of options or videos
options = os.listdir(os.path.join('..', 'data', 's1'))
selected_video = st.selectbox('Choose video', options)

# Generate two columns
col1, col2 = st.columns(2)

if options:

    # Rendering the video
    with col1:
        st.info('The video below displays the converted video in mp4 format')
        file_path = os.path.join('..', 'data', 's1', selected_video)

        # Pass ffmpeg an argument list rather than a shell string so that file
        # names containing spaces or shell metacharacters are handled
        # literally and cannot be interpreted as shell syntax.
        try:
            conversion = subprocess.run(
                ['ffmpeg', '-i', file_path, '-vcodec', 'libx264', 'test_video.mp4', '-y'],
                check=False,
            )
            converted = conversion.returncode == 0
        except OSError:
            # ffmpeg could not be launched at all (e.g. it is not installed).
            converted = False
        if not converted:
            st.warning('ffmpeg could not convert the selected video; the preview below may be stale.')

        # Rendering inside of the app; the context manager closes the file
        # handle once its bytes have been read.
        with open('test_video.mp4', 'rb') as video_file:
            video_bytes = video_file.read()
        st.video(video_bytes)

    with col2:
        st.info('👀 This is all the machine learning model sees when making a prediction')
        video, annotations, image_data = load_data(tf.convert_to_tensor(file_path))
        # Scale the frames into a visible range and save them as a GIF preview.
        imageio.mimsave('animation.gif', np.squeeze((video * 50).astype(np.uint8)), duration=100)
        st.image('animation.gif', width=400)

        st.info('This is the output of the machine learning model as tokens')
        model = load_model()
        # The model expects a leading batch axis, so wrap the single clip.
        yhat = model.predict(tf.expand_dims(video, axis=0))
        # Greedy CTC decode with a sequence length of 75 for the one clip.
        decoder = tf.keras.backend.ctc_decode(yhat, [75], greedy=True)[0][0].numpy()
        st.text(decoder)

        # Convert prediction to text
        st.info('Decode the raw tokens into words')
        converted_prediction = tf.strings.reduce_join(num_to_char(decoder)).numpy().decode('utf-8')
        st.text(converted_prediction)
60
+
test_video.mp4 ADDED
Binary file (110 kB). View file
 
utils.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import tensorflow as tf
2
+ from typing import List
3
+ import numpy as np
4
+ import cv2
5
+ import os
6
+
7
# Characters the lookup layers know about: lowercase letters, a few
# punctuation marks, the digits 1-9 and the space character.
vocab = list("abcdefghijklmnopqrstuvwxyz'?!123456789 ")

# Forward lookup (character -> integer id); the empty string is the
# out-of-vocabulary token.
char_to_num = tf.keras.layers.StringLookup(vocabulary=vocab, oov_token="")

# Inverse lookup (integer id -> character), built from the forward layer's
# vocabulary so the two mappings stay in sync.
num_to_char = tf.keras.layers.StringLookup(
    vocabulary=char_to_num.get_vocabulary(),
    oov_token="",
    invert=True,
)
13
+
14
def load_video(path: str) -> tf.Tensor:
    """Decode a video into a normalised stack of cropped grayscale frames.

    Args:
        path: Location of the video file, passed to ``cv2.VideoCapture``.

    Returns:
        A float32 tensor built from every decoded frame, converted with
        ``tf.image.rgb_to_grayscale`` and cropped to rows 190:236 and
        columns 80:220, then shifted by the mean and divided by the
        standard deviation of the stack.

    Raises:
        ValueError: If no frame could be decoded from ``path``.
    """
    cap = cv2.VideoCapture(path)
    frames = []
    try:
        for _ in range(int(cap.get(cv2.CAP_PROP_FRAME_COUNT))):
            ret, frame = cap.read()
            if not ret:
                # The reported frame count may exceed what can actually be
                # decoded; stop here instead of passing None downstream.
                break
            frame = tf.image.rgb_to_grayscale(frame)
            frames.append(frame[190:236, 80:220, :])
    finally:
        # Release the capture even if decoding or conversion raised.
        cap.release()

    if not frames:
        raise ValueError(f'No frames could be read from video: {path}')

    # NOTE(review): the mean is computed on the frames' original dtype and
    # subtracted before the float cast, exactly as before. Presumably the
    # saved weights were trained on inputs produced this way, so the
    # arithmetic is intentionally left untouched - confirm before changing.
    mean = tf.math.reduce_mean(frames)
    std = tf.math.reduce_std(tf.cast(frames, tf.float32))
    return tf.cast((frames - mean), tf.float32) / std
27
+
28
def load_alignments(path: str) -> tf.Tensor:
    """Read an alignment file and encode its words as character ids.

    Each line is split on whitespace and its third field is taken as the
    word. Entries equal to ``'sil'`` are dropped. A single space token is
    placed before every kept word, and the leading space is removed from
    the encoded result.

    Args:
        path: Location of the alignment text file.

    Returns:
        The output of ``char_to_num`` applied to the flattened characters
        of the kept words (spaces included), minus the first element.
    """
    tokens = []
    with open(path, 'r') as f:
        for line in f:
            fields = line.split()
            # Blank or truncated lines have no word field; skip them rather
            # than failing with IndexError.
            if len(fields) < 3:
                continue
            if fields[2] != 'sil':
                # Extend in place instead of rebuilding the list each time.
                tokens.extend((' ', fields[2]))
    return char_to_num(tf.reshape(tf.strings.unicode_split(tokens, input_encoding='UTF-8'), (-1)))[1:]
38
+
39
def load_data(path: "tf.Tensor | bytes | str"):
    """Load the frames and alignment tokens that belong to one video.

    Only the base file name (text before the first dot of the last path
    component) is taken from ``path``. The video is then read from
    ``../data/s1/<name>.mpg`` and the alignment from
    ``../data/alignments/s1/<name>.align``.

    Args:
        path: Path of the selected video, as an eager string tensor (the
            original calling convention), ``bytes`` or ``str``. Both ``/``
            and ``\\`` are accepted as separators.

    Returns:
        A ``(frames, alignments, image_data)`` tuple: the output of
        ``load_video``, the output of ``load_alignments``, and the frames
        scaled by 255, cast to uint8 and squeezed.
    """
    # Normalise the argument to a plain str whatever form it arrived in.
    if hasattr(path, 'numpy'):
        path = path.numpy()
    if isinstance(path, bytes):
        path = path.decode()

    # Treat both separators alike. Previously the '/' split was overwritten
    # by the '\\' split, which yields an empty name for POSIX paths that
    # begin with '..'.
    file_name = path.replace('\\', '/').split('/')[-1].split('.')[0]

    video_path = os.path.join('..', 'data', 's1', f'{file_name}.mpg')
    alignment_path = os.path.join('..', 'data', 'alignments', 's1', f'{file_name}.align')
    frames = load_video(video_path)
    print(frames.shape)
    alignments = load_alignments(alignment_path)
    # NOTE(review): `.astype` on the frames presumably relies on TensorFlow's
    # NumPy behaviour having been enabled elsewhere - confirm.
    image_data = (frames * 255).astype(np.uint8)
    image_data = np.squeeze(image_data)

    return frames, alignments, image_data