alex-ht committed
Commit d051fbe · Parent(s): 7b01696

fix

ultravox_processing.py CHANGED (+4 -17)
@@ -135,28 +135,11 @@ class UltravoxProcessor(transformers.ProcessorMixin):
         data = {}
         audio_embed_frames = 0
         if audio is not None and len(audio) > 0:
-            if self.audio_padding == "max_length":
-                # 30 seconds is the expected length for Whisper
-                assert sampling_rate is not None, "Sampling rate must be provided."
-                max_audio_len = 30 * sampling_rate
-            else:
-                max_audio_len = max([len(a) for a in audio])
-
-            data["audio_token_len"] = []
-            for a in audio:
-                # It's guaranteed that the number of frames is less than or equal to this amount.
-                # For Whisper this is exact AFAICT, but for Wav2Vec2 it's an upper bound.
-                # Currently, StackAudioFrames makes sure an over-estimation won't cause issues by padding the audio embeddings.
-                nb_encoder_frames = int(round(min(len(a), max_audio_len) / self.encoder_ds_factor + 1e-4))
-                audio_embed_frames = int(np.ceil(nb_encoder_frames / self.stack_factor))
-                data["audio_token_len"].append(audio_embed_frames)
-
             # Main audio processing. The processor is model-specific.
             x = self.audio_processor(
                 audio,
                 sampling_rate=sampling_rate,
                 padding="longest",
-                max_length=max_audio_len,
                 return_attention_mask=True,
                 **kwargs,
             )
@@ -165,6 +148,10 @@ class UltravoxProcessor(transformers.ProcessorMixin):
             else:
                 data["audio_values"] = x.input_values
             data["audio_len"] = x.attention_mask.sum(-1) - 1
+            def cnn_out_len(in_len, kernel, stride=1, padding=1, dilation=1):
+                return np.floor((in_len + (2*padding) - (dilation * (kernel - 1)) - 1)/stride + 1)
+            nb_encoder_frames = [cnn_out_len(cnn_out_len(feat_len, kernel=3), kernel=3, stride=2) for feat_len in data["audio_len"]]
+            data["audio_token_len"] = [np.ceil(x/self.stack_factor) for x in nb_encoder_frames]
 
         if text is not None:
             assert isinstance(