# CL-KWS_202408_v1/model/ukws_guided_ctc.py
import os, sys
import tensorflow as tf
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras import layers
import datetime
# Make sibling modules in this directory importable.
sys.path.append(os.path.dirname(__file__))

import encoder, extractor, discriminator, log_melspectrogram, speech_embedding
from utils import make_feature_matrix as concat_sequence

# Fix random seeds for reproducibility.
seed = 42
tf.random.set_seed(seed)
np.random.seed(seed)
class ukws(Model):
    """Abstract base class for user-defined KWS (keyword spotting) models."""

    def __init__(self, name="ukws", **kwargs):
        super(ukws, self).__init__(name=name)
    def call(self, speech, text):
        """
        Args:
            speech : speech input of shape `(batch, time)`
            text : phoneme sequence of shape `(batch, phoneme)`
        """
        raise NotImplementedError
class BaseUKWS(ukws):
    """User-defined KWS model: audio and text encoders, a cross-modal extractor,
    a discriminator, and an auxiliary per-step sequence cross-entropy head."""

    def __init__(self, name="BaseUKWS", **kwargs):
        super(BaseUKWS, self).__init__(name=name)
        embedding = 128
        self.audio_input = kwargs['audio_input']
        self.text_input = kwargs['text_input']
        self.stack_extractor = kwargs['stack_extractor']
        # Log-mel spectrogram front-end configuration
        _stft = {
            'frame_length' : kwargs['frame_length'],
            'hop_length' : kwargs['hop_length'],
            'num_mel' : kwargs['num_mel'],
            'sample_rate' : kwargs['sample_rate'],
            'log_mel' : kwargs['log_mel'],
        }
        # Audio encoder configuration
        _ae = {
            # [filters, kernel size, stride] per convolutional layer
            'conv' : [[embedding, 5, 2], [embedding * 2, 5, 1]],
            # GRU units per layer
            'gru' : [[embedding], [embedding]],
            # fully-connected layer unit
            'fc' : embedding,
            'audio_input' : self.audio_input,
        }
        # Text encoder configuration
        _te = {
            # fully-connected layer unit
            'fc' : embedding,
            # number of unique phonemes
            'vocab' : kwargs['vocab'],
            'text_input' : kwargs['text_input'],
        }
        # Extractor configuration
        _ext = {
            # shared embedding dimension
            'embedding' : embedding,
        }
        # Discriminator configuration
        _dis = {
            # GRU units per layer
            'gru' : [[embedding],],
        }
        if self.audio_input == 'both':
            # Use both log-mel features and Google speech embeddings.
            self.SPEC = log_melspectrogram.LogMelgramLayer(**_stft)
            self.EMBD = speech_embedding.GoogleSpeechEmbedder()
            self.AE = encoder.EfficientAudioEncoder(downsample=False, **_ae)
        else:
            if self.audio_input == 'raw':
                self.FEAT = log_melspectrogram.LogMelgramLayer(**_stft)
            elif self.audio_input == 'google_embed':
                self.FEAT = speech_embedding.GoogleSpeechEmbedder()
            else:
                raise ValueError(f"Unsupported audio_input: {self.audio_input!r}")
            self.AE = encoder.AudioEncoder(**_ae)
        self.TE = encoder.TextEncoder(**_te)
        if kwargs['stack_extractor']:
            self.EXT = extractor.StackExtractor(**_ext)
        else:
            self.EXT = extractor.BaseExtractor(**_ext)
        self.DIS = discriminator.BaseDiscriminator(**_dis)
        # Per-position logit for the auxiliary sequence cross-entropy loss.
        self.seq_ce_logit = layers.Dense(1, name='sequence_ce')
    def call(self, speech, text):
        """
        Args:
            speech : speech input of shape `(batch, time)`
            text : phoneme sequence of shape `(batch, phoneme)`
        Returns:
            prob, affinity_matrix, LD, seq_ce_logit, n_speech
        """
        if self.audio_input == 'both':
            s = self.SPEC(speech)   # log-mel spectrogram
            g = self.EMBD(speech)   # Google speech embedding
            emb_s, LDN = self.AE(s, g)
        else:
            feat = self.FEAT(speech)
            emb_s, LDN = self.AE(feat)
        emb_t = self.TE(text)
        attention_output, affinity_matrix = self.EXT(emb_s, emb_t)
        prob, LD = self.DIS(attention_output)
        # Number of valid (unmasked) speech frames per example.
        n_speech = tf.math.reduce_sum(tf.cast(emb_s._keras_mask, tf.float32), -1)
        if self.stack_extractor:
            # The stacked extractor output holds the speech positions followed by
            # the text positions; keep only the text positions for the CE head.
            n_text = tf.math.reduce_sum(tf.cast(emb_t._keras_mask, tf.float32), -1)
            n_total = n_speech + n_text
            max_len = tf.shape(attention_output)[1]
            valid_mask = (tf.sequence_mask(n_total, maxlen=max_len, dtype=tf.float32)
                          - tf.sequence_mask(n_speech, maxlen=max_len, dtype=tf.float32))
            valid_attention_output = tf.ragged.boolean_mask(
                attention_output, tf.cast(valid_mask, tf.bool)).to_tensor(0.)
            seq_ce_logit = self.seq_ce_logit(valid_attention_output)[:, :, 0]
            # Pad to the text length so the logits align with the text mask.
            seq_ce_logit = tf.pad(
                seq_ce_logit,
                [[0, 0], [0, tf.shape(emb_t)[1] - tf.shape(seq_ce_logit)[1]]],
                'CONSTANT', constant_values=0.)
            seq_ce_logit._keras_mask = emb_t._keras_mask
else:
seq_ce_logit = self.seq_ce_logit(attention_output)[:,:,0]
seq_ce_logit._keras_mask = attention_output._keras_mask
return prob, affinity_matrix, LD, seq_ce_logit, n_speech
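# Illustrative usage sketch (an assumption, not part of the original training code):
# builds a BaseUKWS instance and runs a dummy forward pass. All keyword-argument
# values and input shapes below are assumed for demonstration only and may differ
# from the configuration actually used with this repository.
if __name__ == "__main__":
    model = BaseUKWS(
        audio_input='raw',        # one of 'raw', 'google_embed', 'both'
        text_input='phoneme',     # assumed text-input mode
        stack_extractor=False,
        frame_length=400,         # assumed: 25 ms frames at 16 kHz
        hop_length=160,           # assumed: 10 ms hop at 16 kHz
        num_mel=40,               # assumed number of mel bins
        sample_rate=16000,
        log_mel=True,
        vocab=42,                 # assumed number of unique phonemes
    )
    dummy_speech = tf.random.normal([2, 16000])                      # (batch, time)
    dummy_text = tf.random.uniform([2, 10], 0, 42, dtype=tf.int32)   # (batch, phoneme)
    prob, affinity, LD, seq_ce_logit, n_speech = model(dummy_speech, dummy_text)
    print(prob.shape, affinity.shape, seq_ce_logit.shape, n_speech.shape)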