alex-ht committed
Commit d051fbe · Parent(s): 7b01696

fix

ultravox_processing.py CHANGED (+4 -17)
@@ -135,28 +135,11 @@ class UltravoxProcessor(transformers.ProcessorMixin):
         data = {}
         audio_embed_frames = 0
         if audio is not None and len(audio) > 0:
-            if self.audio_padding == "max_length":
-                # 30 seconds is the expected length for Whisper
-                assert sampling_rate is not None, "Sampling rate must be provided."
-                max_audio_len = 30 * sampling_rate
-            else:
-                max_audio_len = max([len(a) for a in audio])
-
-            data["audio_token_len"] = []
-            for a in audio:
-                # It's guaranteed that the number of frames is less than or equal to this amount.
-                # For Whisper this is exact AFAICT, but for Wav2Vec2 it's an upper bound.
-                # Currently, StackAudioFrames makes sure an over-estimation won't cause issues by padding the audio embeddings.
-                nb_encoder_frames = int(round(min(len(a), max_audio_len) / self.encoder_ds_factor + 1e-4))
-                audio_embed_frames = int(np.ceil(nb_encoder_frames / self.stack_factor))
-                data["audio_token_len"].append(audio_embed_frames)
-
             # Main audio processing. The processor is model-specific.
             x = self.audio_processor(
                 audio,
                 sampling_rate=sampling_rate,
                 padding="longest",
-                max_length=max_audio_len,
                 return_attention_mask=True,
                 **kwargs,
             )
@@ -165,6 +148,10 @@ class UltravoxProcessor(transformers.ProcessorMixin):
             else:
                 data["audio_values"] = x.input_values
             data["audio_len"] = x.attention_mask.sum(-1) - 1
+            def cnn_out_len(in_len, kernel, stride=1, padding=1, dilation=1):
+                return np.floor((in_len + (2*padding) - (dilation * (kernel - 1)) - 1)/stride + 1)
+            nb_encoder_frames = [cnn_out_len(cnn_out_len(feat_len, kernel=3), kernel=3, stride=2) for feat_len in data["audio_len"]]
+            data["audio_token_len"] = [np.ceil(x/self.stack_factor) for x in nb_encoder_frames]
 
         if text is not None:
             assert isinstance(