AlexHung29629
/

test_mllama_v12

Feature Extraction

Model card Files Files and versions Community

AlexHung29629 committed on Nov 26, 2024

Commit

b9c0420

·

verified ·

1 Parent(s): 8769079

Update ultravox_processing.py

Files changed (1) hide show

ultravox_processing.py +21 -20

ultravox_processing.py CHANGED Viewed

@@ -163,28 +163,29 @@ class UltravoxProcessor(transformers.ProcessorMixin):
             processed_text = []
             data["audio_token_start_idx"] = []
             for t in text:
-                if self.audio_placeholder in t:
-                    if "audio_token_len" not in data:
-                        raise ValueError(
-                            f"audio must be provided when using audio placeholder ({self.audio_placeholder}) in text."
-                        )
-                    start_idx = len(
-                        self.tokenizer.encode(
-                            t.split(self.audio_placeholder)[0],
-                            add_special_tokens=False,
-                        )
                     )
-                    data["audio_token_start_idx"].append(start_idx)
-                    # Replace the audio placeholder with the audio token.
-                    #   e.g. "Transcribe <|audio|>" -> "Transcribe </s></s></s></s></s></s></s></s>"
-                    #        where the number of </s> is the number of audio frames.
-                    t = t.replace(
-                        self.audio_placeholder,
-                        self.audio_token_replacement * audio_embed_frames,
                     )
-                    processed_text.append(t)
             # Special tokens like BOS should already have been added by the caller.
             data.update(self.tokenizer(processed_text, add_special_tokens=False, padding='longest', **kwargs))

             processed_text = []
             data["audio_token_start_idx"] = []
             for t in text:
+                assert self.audio_placeholder in t
+                if "audio_token_len" not in data:
+                    raise ValueError(
+                        f"audio must be provided when using audio placeholder ({self.audio_placeholder}) in text."
                     )
+                start_idx = len(
+                    self.tokenizer.encode(
+                        t.split(self.audio_placeholder)[0],
+                        add_special_tokens=False,
                     )
+                )
+                data["audio_token_start_idx"].append(start_idx)
+                # Replace the audio placeholder with the audio token.
+                #   e.g. "Transcribe <|audio|>" -> "Transcribe </s></s></s></s></s></s></s></s>"
+                #        where the number of </s> is the number of audio frames.
+                t = t.replace(
+                    self.audio_placeholder,
+                    self.audio_token_replacement * audio_embed_frames,
+                )
+                processed_text.append(t)
             # Special tokens like BOS should already have been added by the caller.
             data.update(self.tokenizer(processed_text, add_special_tokens=False, padding='longest', **kwargs))