Spaces:

Francis0917
/

CL-KWS_202408_v1

Runtime error

CL-KWS_202408_v1

File size: 4,843 Bytes

2045faa

import os, sys
import tensorflow as tf
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras import layers
import datetime
sys.path.append(os.path.dirname(__file__))
import encoder, extractor, discriminator, log_melspectrogram, speech_embedding
from utils import make_feature_matrix as concat_sequence

# Seed both NumPy's and TensorFlow's global RNGs with the same fixed value
# as soon as this module is imported.
seed = 42
np.random.seed(seed)
tf.random.set_seed(seed)

class ukws(Model):
    """Abstract parent for user-defined keyword-spotting (KWS) models.

    It only fixes the `call(speech, text)` interface; subclasses provide the
    actual forward pass.
    """

    def __init__(self, name="ukws", **kwargs):
        # Only `name` is handed to the Keras `Model` constructor; any extra
        # keyword arguments are accepted here but not forwarded.
        super().__init__(name=name)

    def call(self, speech, text):
        """Forward pass placeholder.

        Args:
            speech  : speech feature of shape `(batch, time)`
            text    : text embedding of shape `(batch, phoneme)`

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError()

class BaseUKWS(ukws):
    """User-defined KWS model: audio encoder + text encoder + extractor + discriminator.

    Required keyword arguments (each is read with `kwargs[...]`, so a missing
    one raises `KeyError`):
        audio_input     : 'both', 'raw' or 'google_embed'; selects the audio
                          front-end built in `__init__`
        text_input      : forwarded to the text encoder
        stack_extractor : truthy -> `extractor.StackExtractor`,
                          falsy  -> `extractor.BaseExtractor`
        frame_length, hop_length, num_mel, sample_rate, log_mel :
                          settings for `log_melspectrogram.LogMelgramLayer`
        vocab           : number of unique phonemes, forwarded to the text encoder
    """

    def __init__(self, name="BaseUKWS", **kwargs):
        """Build all sub-layers.

        Raises:
            KeyError: if a required keyword argument is missing.
            ValueError: if `audio_input` is not 'both', 'raw' or 'google_embed'.
        """
        super(BaseUKWS, self).__init__(name=name)
        # Width shared by every sub-module configured below.
        embedding = 128
        self.audio_input = kwargs['audio_input']
        self.text_input = kwargs['text_input']
        self.stack_extractor = kwargs['stack_extractor']

        _stft = {
            'frame_length': kwargs['frame_length'],
            'hop_length': kwargs['hop_length'],
            'num_mel': kwargs['num_mel'],
            'sample_rate': kwargs['sample_rate'],
            'log_mel': kwargs['log_mel'],
        }
        _ae = {
            # [filter, kernel size, stride]
            'conv': [[embedding, 5, 2], [embedding * 2, 5, 1]],
            # [unit]
            'gru': [[embedding], [embedding]],
            # fully-connected layer unit
            'fc': embedding,
            'audio_input': self.audio_input,
        }
        _te = {
            # fully-connected layer unit
            'fc': embedding,
            # number of uniq. phonemes
            'vocab': kwargs['vocab'],
            'text_input': kwargs['text_input'],
        }
        _ext = {
            # [unit]
            'embedding': embedding,
        }
        _dis = {
            # [unit]
            'gru': [[embedding],],
        }

        # NOTE: sub-layers are created in the same order and under the same
        # attribute names as before, so saved weights keep lining up.
        if self.audio_input == 'both':
            # Two front-ends feed one encoder that takes both feature streams.
            self.SPEC = log_melspectrogram.LogMelgramLayer(**_stft)
            self.EMBD = speech_embedding.GoogleSpeechEmbedder()
            self.AE = encoder.EfficientAudioEncoder(downsample=False, **_ae)
        else:
            if self.audio_input == 'raw':
                self.FEAT = log_melspectrogram.LogMelgramLayer(**_stft)
            elif self.audio_input == 'google_embed':
                self.FEAT = speech_embedding.GoogleSpeechEmbedder()
            else:
                # Without this check `self.FEAT` would be left undefined and
                # the error would only appear as an AttributeError in `call`.
                raise ValueError(
                    f"Unsupported audio_input {self.audio_input!r}; "
                    "expected 'both', 'raw' or 'google_embed'."
                )
            self.AE = encoder.AudioEncoder(**_ae)

        self.TE = encoder.TextEncoder(**_te)

        if kwargs['stack_extractor']:
            self.EXT = extractor.StackExtractor(**_ext)
        else:
            self.EXT = extractor.BaseExtractor(**_ext)

        self.DIS = discriminator.BaseDiscriminator(**_dis)

        self.seq_ce_logit = layers.Dense(1, name='sequence_ce')

    def call(self, speech, text):
        """Run the full model on a batch.

        Args:
            speech      : speech feature of shape `(batch, time)`
            text        : text embedding of shape `(batch, phoneme)`

        Returns:
            Tuple `(prob, affinity_matrix, LD, seq_ce_logit, n_speech)`:
                prob, LD        : the two outputs of the discriminator
                affinity_matrix : second output of the extractor
                seq_ce_logit    : output of the `sequence_ce` dense layer with
                                  its last (size-1) axis dropped; carries a
                                  `_keras_mask`
                n_speech        : float32 count, per example, of unmasked
                                  positions in the audio embedding
        """
        # The audio encoder returns a second value that is not used here.
        if self.audio_input == 'both':
            spectrogram = self.SPEC(speech)
            google_embedding = self.EMBD(speech)
            emb_s, _ = self.AE(spectrogram, google_embedding)
        else:
            emb_s, _ = self.AE(self.FEAT(speech))

        emb_t = self.TE(text)
        attention_output, affinity_matrix = self.EXT(emb_s, emb_t)
        prob, LD = self.DIS(attention_output)

        # Computed once and shared by both branches below and the return value.
        n_speech = tf.math.reduce_sum(tf.cast(emb_s._keras_mask, tf.float32), -1)

        if self.stack_extractor:
            # Keep only positions [n_speech, n_speech + n_text) of the
            # extractor output.
            # NOTE(review): presumably the stacked extractor lays out speech
            # frames followed by text tokens along axis 1 -- confirm.
            n_text = tf.math.reduce_sum(tf.cast(emb_t._keras_mask, tf.float32), -1)
            max_len = tf.shape(attention_output)[1]
            upto_total = tf.sequence_mask(n_speech + n_text, maxlen=max_len, dtype=tf.float32)
            upto_speech = tf.sequence_mask(n_speech, maxlen=max_len, dtype=tf.float32)
            valid_mask = upto_total - upto_speech

            # Ragged gather of the selected positions, densified with zeros.
            valid_attention_output = tf.ragged.boolean_mask(
                attention_output, tf.cast(valid_mask, tf.bool)
            ).to_tensor(0.)
            seq_ce_logit = self.seq_ce_logit(valid_attention_output)[:, :, 0]

            # Right-pad with zeros to the text embedding's length so the
            # text mask can be attached.
            pad_right = tf.shape(emb_t)[1] - tf.shape(seq_ce_logit)[1]
            seq_ce_logit = tf.pad(
                seq_ce_logit, [[0, 0], [0, pad_right]], 'CONSTANT', constant_values=0.
            )
            seq_ce_logit._keras_mask = emb_t._keras_mask
        else:
            seq_ce_logit = self.seq_ce_logit(attention_output)[:, :, 0]
            seq_ce_logit._keras_mask = attention_output._keras_mask

        return prob, affinity_matrix, LD, seq_ce_logit, n_speech