AlexHung29629
committed on
Commit
•
2c3c2d2
1
Parent(s):
f29a23c
Update ultravox_processing.py
Browse files
- ultravox_processing.py +4 -1
ultravox_processing.py
CHANGED
@@ -150,8 +150,11 @@ class UltravoxProcessor(transformers.ProcessorMixin):
|
|
150 |
data["audio_len"] = x.attention_mask.sum(-1) - 1
|
151 |
def cnn_out_len(in_len, kernel, stride=1, padding=1, dilation=1):
|
152 |
return np.floor((in_len + (2*padding) - (dilation * (kernel - 1)) - 1)/stride + 1)
|
|
|
|
|
|
|
153 |
nb_encoder_frames = [cnn_out_len(cnn_out_len(feat_len, kernel=3), kernel=3, stride=2) for feat_len in data["audio_len"]]
|
154 |
-
data["audio_token_len"] = [
|
155 |
|
156 |
if text is not None:
|
157 |
assert isinstance(
|
|
|
150 |
data["audio_len"] = x.attention_mask.sum(-1) - 1
|
151 |
def cnn_out_len(in_len, kernel, stride=1, padding=1, dilation=1):
|
152 |
return np.floor((in_len + (2*padding) - (dilation * (kernel - 1)) - 1)/stride + 1)
|
153 |
+
def stack_frame_len(T):
|
154 |
+
T_pad = (T + self.stack_factor - 1) // self.stack_factor * self.stack_factor
|
155 |
+
return int((T_pad + self.stack_factor) // self.stack_factor)
|
156 |
nb_encoder_frames = [cnn_out_len(cnn_out_len(feat_len, kernel=3), kernel=3, stride=2) for feat_len in data["audio_len"]]
|
157 |
+
data["audio_token_len"] = [stack_frame_len(x) for x in nb_encoder_frames]
|
158 |
|
159 |
if text is not None:
|
160 |
assert isinstance(
|