alex-ht committed on
Commit
d051fbe
·
1 Parent(s): 7b01696
Files changed (1) hide show
  1. ultravox_processing.py +4 -17
ultravox_processing.py CHANGED
@@ -135,28 +135,11 @@ class UltravoxProcessor(transformers.ProcessorMixin):
135
  data = {}
136
  audio_embed_frames = 0
137
  if audio is not None and len(audio) > 0:
138
- if self.audio_padding == "max_length":
139
- # 30 seconds is the expected length for Whisper
140
- assert sampling_rate is not None, "Sampling rate must be provided."
141
- max_audio_len = 30 * sampling_rate
142
- else:
143
- max_audio_len = max([len(a) for a in audio])
144
-
145
- data["audio_token_len"] = []
146
- for a in audio:
147
- # It's guaranteed that the number of frames is less than or equal to this amount.
148
- # For Whisper this is exact AFAICT, but for Wav2Vec2 it's an upper bound.
149
- # Currently, StackAudioFrames makes sure an over-estimation won't cause issues by padding the audio embeddings.
150
- nb_encoder_frames = int(round(min(len(a), max_audio_len)/ self.encoder_ds_factor + 1e-4))
151
- audio_embed_frames = int(np.ceil(nb_encoder_frames / self.stack_factor))
152
- data["audio_token_len"].append(audio_embed_frames)
153
-
154
  # Main audio processing. The processor is model-specific.
155
  x = self.audio_processor(
156
  audio,
157
  sampling_rate=sampling_rate,
158
  padding="longest",
159
- max_length=max_audio_len,
160
  return_attention_mask=True,
161
  **kwargs,
162
  )
@@ -165,6 +148,10 @@ class UltravoxProcessor(transformers.ProcessorMixin):
165
  else:
166
  data["audio_values"] = x.input_values
167
  data["audio_len"] = x.attention_mask.sum(-1) - 1
 
 
 
 
168
 
169
  if text is not None:
170
  assert isinstance(
 
135
  data = {}
136
  audio_embed_frames = 0
137
  if audio is not None and len(audio) > 0:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
  # Main audio processing. The processor is model-specific.
139
  x = self.audio_processor(
140
  audio,
141
  sampling_rate=sampling_rate,
142
  padding="longest",
 
143
  return_attention_mask=True,
144
  **kwargs,
145
  )
 
148
  else:
149
  data["audio_values"] = x.input_values
150
  data["audio_len"] = x.attention_mask.sum(-1) - 1
151
+ def cnn_out_len(in_len, kernel, stride=1, padding=1, dilation=1):
152
+ return np.floor((in_len + (2*padding) - (dilation * (kernel - 1)) - 1)/stride + 1)
153
+ nb_encoder_frames = [cnn_out_len(cnn_out_len(feat_len, kernel=3), kernel=3, stride=2) for feat_len in data["audio_len"]]
154
+ data["audio_token_len"] = [np.ceil(x/self.stack_factor) for x in nb_encoder_frames]
155
 
156
  if text is not None:
157
  assert isinstance(