alex-ht committed
Commit 91d53d9
1 Parent(s): 0524ea2
Files changed (1)
  1. ultravox_processing.py +38 -35
ultravox_processing.py CHANGED
@@ -138,22 +138,25 @@ class UltravoxProcessor(transformers.ProcessorMixin):
         if self.audio_padding == "max_length":
             # 30 seconds is the expected length for Whisper
             assert sampling_rate is not None, "Sampling rate must be provided."
-            audio_len = 30 * sampling_rate
+            max_audio_len = 30 * sampling_rate
         else:
-            audio_len = max([len(a) for a in audio])
-        # It's guaranteed that the number of frames is less than or equal to this amount.
-        # For Whisper this is exact AFAICT, but for Wav2Vec2 it's an upper bound.
-        # Currently, StackAudioFrames makes sure an over-estimation won't cause issues by padding the audio embeddings.
-        nb_encoder_frames = int(round(audio_len / self.encoder_ds_factor + 1e-4))
-        audio_embed_frames = int(np.ceil(nb_encoder_frames / self.stack_factor))
-        data["audio_token_len"] = [audio_embed_frames]
+            max_audio_len = max([len(a) for a in audio])
+
+        data["audio_token_len"] = []
+        for a in audio:
+            # It's guaranteed that the number of frames is less than or equal to this amount.
+            # For Whisper this is exact AFAICT, but for Wav2Vec2 it's an upper bound.
+            # Currently, StackAudioFrames makes sure an over-estimation won't cause issues by padding the audio embeddings.
+            nb_encoder_frames = int(round(min(len(a), max_audio_len) / self.encoder_ds_factor + 1e-4))
+            audio_embed_frames = int(np.ceil(nb_encoder_frames / self.stack_factor))
+            data["audio_token_len"].append(audio_embed_frames)
 
         # Main audio processing. The processor is model-specific.
         x = self.audio_processor(
             audio,
             sampling_rate=sampling_rate,
             padding="longest",
-            max_length=audio_len,
+            max_length=max_audio_len,
             return_attention_mask=True,
             **kwargs,
         )
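For intuition about the new per-clip loop: a minimal worked example, assuming Whisper-style constants (encoder_ds_factor = 320 input samples per encoder frame, stack_factor = 8 encoder frames per audio token, 16 kHz input; these values are illustrative assumptions, not taken from this commit):

    import numpy as np

    encoder_ds_factor = 320  # assumed: input samples per encoder frame
    stack_factor = 8         # assumed: encoder frames stacked per audio token
    sampling_rate = 16_000

    for seconds in (5, 30):
        audio_len = seconds * sampling_rate
        nb_encoder_frames = int(round(audio_len / encoder_ds_factor + 1e-4))
        audio_embed_frames = int(np.ceil(nb_encoder_frames / stack_factor))
        print(seconds, nb_encoder_frames, audio_embed_frames)
    # 5 s  -> 250 encoder frames -> 32 audio tokens
    # 30 s -> 1500 encoder frames -> 188 audio tokens

The min(len(a), max_audio_len) cap in the new loop matters because each clip is now sized individually: a clip longer than the padding target would otherwise be assigned more tokens than its padded features can actually fill.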
@@ -161,39 +164,39 @@ class UltravoxProcessor(transformers.ProcessorMixin):
             data["audio_values"] = x.input_features
         else:
             data["audio_values"] = x.input_values
-        if self.audio_padding == "max_length":
-            data["audio_len"] = x.attention_mask.sum(-1) - 1
-        else:
-            data["audio_len"] = [data["audio_values"].shape[-1]]
+        data["audio_len"] = x.attention_mask.sum(-1) - 1
 
         if text is not None:
             assert isinstance(
-                text, str
-            ), "Text must be a string. Batch mode not supported yet."
-            if self.audio_placeholder in text:
-                if "audio_token_len" not in data:
-                    raise ValueError(
-                        f"audio must be provided when using audio placeholder ({self.audio_placeholder}) in text."
+                text, list
+            ), "Text must be a list."
+            processed_text = []
+            for i, t in enumerate(text):
+                if self.audio_placeholder in t:
+                    if "audio_token_len" not in data:
+                        raise ValueError(
+                            f"audio must be provided when using audio placeholder ({self.audio_placeholder}) in text."
+                        )
+
+                    start_idx = len(
+                        self.tokenizer.encode(
+                            t[: t.index(self.audio_placeholder)],
+                            add_special_tokens=False,
+                        )
                     )
-
-                start_idx = len(
-                    self.tokenizer.encode(
-                        text[: text.index(self.audio_placeholder)],
-                        add_special_tokens=False,
+                    data.setdefault("audio_token_start_idx", []).append(start_idx)
+
+                    # Replace the audio placeholder with the audio token.
+                    # e.g. "Transcribe\n<|audio|>" -> "Transcribe </s></s></s></s></s></s></s></s>"
+                    # where the number of </s> is the number of audio frames.
+                    t = t.replace(
+                        self.audio_placeholder,
+                        self.audio_token_replacement * data["audio_token_len"][i],
                     )
-                )
-                data["audio_token_start_idx"] = [start_idx]
-
-                # Replace the audio placeholder with the audio token.
-                # e.g. "Transcribe\n<|audio|>" -> "Transcribe </s></s></s></s></s></s></s></s>"
-                # where the number of </s> is the number of audio frames.
-                text = text.replace(
-                    self.audio_placeholder,
-                    self.audio_token_replacement * audio_embed_frames,
-                )
+                processed_text.append(t)
 
             # Special tokens like BOS should already have been added by the caller.
-            data.update(self.tokenizer([text], add_special_tokens=False, **kwargs))
+            data.update(self.tokenizer(processed_text, add_special_tokens=False, **kwargs))
 
         return transformers.BatchFeature(data=data, tensor_type=return_tensors)
 
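Taken together, the two hunks switch the processor from a single prompt string to batched lists: audio_token_len now carries one entry per clip, audio_len always comes from the feature extractor's attention mask, and each prompt's placeholder is expanded independently. A minimal usage sketch, assuming an already-constructed UltravoxProcessor instance named processor, parallel text/audio lists (prompt i paired with clip i), and one <|audio|> placeholder per prompt; the sample data is hypothetical:

    import numpy as np

    texts = ["Transcribe\n<|audio|>", "Summarize\n<|audio|>"]
    audios = [
        np.zeros(5 * 16_000, dtype=np.float32),  # 5 s placeholder clip
        np.zeros(8 * 16_000, dtype=np.float32),  # 8 s placeholder clip
    ]

    batch = processor(
        text=texts,
        audio=audios,
        sampling_rate=16_000,
        return_tensors="pt",
    )
    # batch["audio_token_len"] holds one token count per clip, and each prompt's
    # <|audio|> placeholder has been expanded to that many replacement tokens.

Because the per-prompt loop indexes audio_token_len by prompt position and appends one audio_token_start_idx per placeholder, prompts and clips must be supplied in matching order.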
 