Inconsistencies with EOU token; can anyone check?

#7
by m-aliabbas1 - opened
import nemo.collections.asr as nemo_asr
# Restore the ASR model from the local .nemo checkpoint archive.
asr_model = nemo_asr.models.ASRModel.restore_from(
    "models/parakeet_realtime_eou_120m-v1/parakeet_realtime_eou_120m-v1.nemo"
)

# Create a simplified streaming demo
# Note: True streaming with this model requires the NeMo streaming API
# This demonstrates batch processing with configurable chunk sizes

class StreamingASR:
    """Simulate streaming recognition by transcribing a file chunk by chunk.

    Every chunk is handed to ``model.transcribe`` on its own; nothing is
    carried over from one chunk to the next.

    NOTE(review): because each chunk is decoded as a stand-alone clip, an
    end-of-utterance marker reported for a chunk presumably reflects the
    chunk boundary rather than a real end of utterance -- confirm against
    the NeMo cache-aware streaming documentation.

    NOTE(review): ``stream_file`` reads audio through a module-level name
    ``sf`` that is not imported in the visible code; presumably
    ``import soundfile as sf`` is expected -- confirm.
    """

    # Substrings treated as an end-of-utterance marker inside a transcript.
    EOU_MARKERS = ("<EOU>", "</s>")

    def __init__(self, model, chunk_size_ms=1000):
        """
        Args:
            model: The loaded ASR model; must expose ``cfg.sample_rate`` and
                ``transcribe``.
            chunk_size_ms: Default size of each audio chunk in milliseconds
                (1000ms default). Used by ``stream_file`` when it is called
                without an explicit chunk size.
        """
        self.model = model
        self.sample_rate = model.cfg.sample_rate
        # Kept so stream_file can fall back to the configured chunk size.
        self.chunk_size_ms = chunk_size_ms
        self.chunk_size_samples = int(chunk_size_ms * self.sample_rate / 1000)

    def _resolve_chunk_ms(self, chunk_size_ms):
        """Return the explicit chunk size, or the constructor's when None."""
        return self.chunk_size_ms if chunk_size_ms is None else chunk_size_ms

    @staticmethod
    def _to_mono(audio):
        """Average the channel axis of a 2-D array; pass 1-D input through."""
        # NOTE(review): assumes multi-channel audio is laid out as
        # (frames, channels), as soundfile documents -- confirm `sf` is soundfile.
        if getattr(audio, "ndim", 1) > 1:
            return audio.mean(axis=1)
        return audio

    @staticmethod
    def _chunk_spans(total_samples, chunk_samples):
        """Return the (start, end) sample index pairs covering the audio."""
        if chunk_samples <= 0:
            raise ValueError(
                f"chunk size must cover at least one sample, got {chunk_samples}"
            )
        return [
            (start, min(start + chunk_samples, total_samples))
            for start in range(0, total_samples, chunk_samples)
        ]

    @staticmethod
    def _result_text(result):
        """Extract the text of the first hypothesis, or '' when there is none."""
        if not result:
            return ""
        first = result[0]
        return first.text if hasattr(first, "text") else str(first)

    @classmethod
    def _has_eou(cls, text):
        """Tell whether the text contains any end-of-utterance marker."""
        return any(marker in text for marker in cls.EOU_MARKERS)

    def stream_file(self, audio_file, chunk_size_ms=None):
        """
        Process audio file in chunks to simulate streaming

        Args:
            audio_file: path to audio file
            chunk_size_ms: chunk duration in milliseconds; when None, the
                chunk size given to the constructor is used

        Returns:
            The non-empty chunk transcripts joined by single spaces. EOU
            markers are not stripped, so they remain in the returned text.

        Raises:
            ValueError: if the chunk size amounts to less than one sample.
        """
        chunk_size_ms = self._resolve_chunk_ms(chunk_size_ms)

        # Load audio file and collapse any channel axis so slicing and
        # resampling operate on a single 1-D signal.
        audio, sr = sf.read(audio_file)
        audio = self._to_mono(audio)

        # `rate` is the sample rate the audio actually has after this step;
        # it drives the duration, chunk length and timestamps below.
        rate = self.sample_rate
        if sr != self.sample_rate:
            try:
                import librosa
                audio = librosa.resample(audio, orig_sr=sr, target_sr=self.sample_rate)
            except ImportError:
                # Best effort: keep going with the file's own rate, but say so,
                # since the model is configured for self.sample_rate.
                print(
                    f"Warning: librosa not installed; audio stays at {sr} Hz, "
                    f"model expects {self.sample_rate} Hz"
                )
                rate = sr

        # Validate and compute the chunk layout before printing anything.
        spans = self._chunk_spans(len(audio), int(chunk_size_ms * rate / 1000))
        num_chunks = len(spans)

        print(f"Processing audio file: {audio_file}")
        print(f"Audio length: {len(audio)/rate:.2f} seconds")
        print(f"Processing in {chunk_size_ms}ms chunks...")
        print("-" * 60)

        all_transcripts = []

        # Process each chunk independently.
        for i, (start_idx, end_idx) in enumerate(spans, start=1):
            chunk = audio[start_idx:end_idx]

            try:
                result = self.model.transcribe([chunk], batch_size=1)
                text = self._result_text(result)
            except Exception as e:
                # A failing chunk is reported and skipped so the remaining
                # chunks are still processed.
                print(f"Error in chunk {i}: {e}")
                continue

            if not text:
                continue

            eou_marker = " [EOU detected]" if self._has_eou(text) else ""
            chunk_time = start_idx / rate
            print(f"[{chunk_time:.2f}s] Chunk {i}/{num_chunks}: {text}{eou_marker}")
            all_transcripts.append(text)

        final_transcript = " ".join(all_transcripts)
        print("-" * 60)
        print(f"Final transcript: {final_transcript}")

        return final_transcript

print("✅ StreamingASR class defined")

# Create the streaming processor (using 1-second chunks for better accuracy)
streaming_asr = StreamingASR(asr_model, chunk_size_ms=1000)
print("✅ Streaming ASR processor initialized")

Output
```
Processing audio file: 2086-149220-0033.wav
Audio length: 7.43 seconds
Processing in 1000ms chunks...

Transcribing: 100%|██████████| 1/1 [00:00<00:00, 6.04it/s]
Transcribing: 100%|██████████| 1/1 [00:00<00:00, 6.04it/s]
[0.00s] Chunk 1/8: well i don't [EOU detected]
Transcribing: 100%|██████████| 1/1 [00:00<00:00, 6.25it/s]
Transcribing: 100%|██████████| 1/1 [00:00<00:00, 6.25it/s]
[1.00s] Chunk 2/8: wish to see it anymore [EOU detected]
Transcribing: 100%|██████████| 1/1 [00:00<00:00, 6.21it/s]
Transcribing: 100%|██████████| 1/1 [00:00<00:00, 6.21it/s]
[2.00s] Chunk 3/8: observed phoebe [EOU detected]
Transcribing: 100%|██████████| 1/1 [00:00<00:00, 5.58it/s]
Transcribing: 100%|██████████| 1/1 [00:00<00:00, 5.58it/s]
[3.00s] Chunk 4/8: turning away her
Transcribing: 100%|██████████| 1/1 [00:00<00:00, 5.61it/s]
Transcribing: 100%|██████████| 1/1 [00:00<00:00, 5.61it/s]
[4.00s] Chunk 5/8: eyes [EOU detected]
Transcribing: 100%|██████████| 1/1 [00:00<00:00, 5.69it/s]
Transcribing: 100%|██████████| 1/1 [00:00<00:00, 5.69it/s]
[5.00s] Chunk 6/8: it is certainly very [EOU detected]
Transcribing: 100%|██████████| 1/1 [00:00<00:00, 5.55it/s]
Transcribing: 100%|██████████| 1/1 [00:00<00:00, 5.55it/s]
[6.00s] Chunk 7/8: like the old portrait [EOU detected]
Transcribing: 100%|██████████| 1/1 [00:00<00:00, 6.28it/s]
[7.00s] Chunk 8/8: [EOU detected]

Final transcript: well i don't wish to see it anymore observed phoebe turning away her eyes it is certainly very like the old portrait


Sign up or log in to comment