Inconsistencies with EOU token; can anyone check?
#7
by
m-aliabbas1
- opened
import nemo.collections.asr as nemo_asr
# Restore the ASR model from the local .nemo checkpoint archive.
asr_model = nemo_asr.models.ASRModel.restore_from(
    restore_path="models/parakeet_realtime_eou_120m-v1/parakeet_realtime_eou_120m-v1.nemo",
)
# Simplified streaming demo.
# Note: true streaming with this model requires the NeMo streaming API.
# This demo instead runs batch transcription on each fixed-size audio chunk
# independently, with a configurable chunk size.
class StreamingASR:
    """Simulate streaming ASR by transcribing an audio file chunk by chunk.

    Each chunk is passed to ``model.transcribe`` on its own, so no audio
    context or model state is carried from one chunk to the next.

    NOTE(review): because every chunk is transcribed in isolation, any
    end-of-utterance marker in the output is a per-chunk prediction rather
    than a decision made over the whole recording -- presumably this is why
    EOU shows up on almost every chunk. Confirm against NeMo's streaming
    inference before relying on these flags.
    """

    def __init__(self, model, chunk_size_ms=1000):
        """
        Args:
            model: The loaded ASR model. It must expose ``cfg.sample_rate``
                and a ``transcribe`` method.
            chunk_size_ms: Default size of each audio chunk in milliseconds
                (1000ms default).
        """
        self.model = model
        self.sample_rate = model.cfg.sample_rate
        # Keep the millisecond value as well so stream_file() can fall back
        # to it and report it in its progress output.
        self.chunk_size_ms = chunk_size_ms
        self.chunk_size_samples = int(chunk_size_ms * self.sample_rate / 1000)

    def stream_file(self, audio_file, chunk_size_ms=None):
        """
        Process audio file in chunks to simulate streaming.

        Args:
            audio_file: Path to the audio file.
            chunk_size_ms: Chunk duration in milliseconds. When omitted, the
                chunk size given to the constructor is used.

        Returns:
            The texts of all non-empty chunks joined with single spaces.
            End-of-utterance markers are deliberately left in the text.

        Raises:
            ValueError: If the chunk size resolves to less than one sample.
        """
        # Honour the chunk size configured on the instance unless the caller
        # overrides it for this call.
        if chunk_size_ms is None:
            chunk_size_ms = self.chunk_size_ms
            chunk_samples = self.chunk_size_samples
        else:
            chunk_samples = int(chunk_size_ms * self.sample_rate / 1000)

        # Fail early with a clear message instead of a ZeroDivisionError in
        # the chunk-count computation below.
        if chunk_samples <= 0:
            raise ValueError(
                f"chunk_size_ms={chunk_size_ms!r} yields {chunk_samples} samples "
                f"at {self.sample_rate} Hz; it must cover at least one sample"
            )

        # Load audio file.
        # NOTE(review): `sf` is not imported in the visible code; presumably
        # `import soundfile as sf` exists elsewhere -- confirm.
        audio, sr = sf.read(audio_file)

        # Down-mix multi-channel audio to mono so that slicing, resampling and
        # the duration math below all operate on a 1-D signal.
        # NOTE(review): assumes a (frames, channels) layout for multi-channel
        # files -- confirm against the reader's documentation.
        if getattr(audio, "ndim", 1) > 1:
            audio = audio.mean(axis=1)

        # Resample if needed.
        if sr != self.sample_rate:
            try:
                import librosa
                audio = librosa.resample(audio, orig_sr=sr, target_sr=self.sample_rate)
            except ImportError:
                # Best effort: processing continues with the audio at its
                # original rate, so durations and results may be off.
                print("Warning: librosa not installed")

        print(f"Processing audio file: {audio_file}")
        print(f"Audio length: {len(audio)/self.sample_rate:.2f} seconds")
        print(f"Processing in {chunk_size_ms}ms chunks...")
        print("-" * 60)

        # Ceiling division so a trailing partial chunk is still processed.
        num_chunks = (len(audio) + chunk_samples - 1) // chunk_samples
        all_transcripts = []

        for i in range(num_chunks):
            start_idx = i * chunk_samples
            # Slicing clamps at the end of the array, so the last chunk may be
            # shorter than chunk_samples.
            chunk = audio[start_idx:start_idx + chunk_samples]

            # Only the model call is guarded; a failing chunk is reported and
            # skipped so the remaining chunks are still processed.
            try:
                result = self.model.transcribe([chunk], batch_size=1)
            except Exception as e:
                print(f"Error in chunk {i+1}: {e}")
                continue

            if not result:
                continue

            first = result[0]
            text = first.text if hasattr(first, "text") else str(first)
            if not text:
                continue

            # The markers are only detected here, not stripped, so they stay
            # visible in the per-chunk output and the final transcript.
            has_eou = "<EOU>" in text or "</s>" in text
            eou_marker = " [EOU detected]" if has_eou else ""
            chunk_time = start_idx / self.sample_rate
            print(f"[{chunk_time:.2f}s] Chunk {i+1}/{num_chunks}: {text}{eou_marker}")
            all_transcripts.append(text)

        final_transcript = " ".join(all_transcripts)
        print("-" * 60)
        print(f"Final transcript: {final_transcript}")
        return final_transcript
# Status message confirming the class definition above was executed.
print("✅ StreamingASR class defined")

# Create the streaming processor (using 1-second chunks for better accuracy)
streaming_asr = StreamingASR(asr_model, chunk_size_ms=1000)
print("✅ Streaming ASR processor initialized")
Output
```
Processing audio file: 2086-149220-0033.wav
Audio length: 7.43 seconds
Processing in 1000ms chunks...
Transcribing: 100%|██████████| 1/1 [00:00<00:00, 6.04it/s]
Transcribing: 100%|██████████| 1/1 [00:00<00:00, 6.04it/s]
[0.00s] Chunk 1/8: well i don't [EOU detected]
Transcribing: 100%|██████████| 1/1 [00:00<00:00, 6.25it/s]
Transcribing: 100%|██████████| 1/1 [00:00<00:00, 6.25it/s]
[1.00s] Chunk 2/8: wish to see it anymore [EOU detected]
Transcribing: 100%|██████████| 1/1 [00:00<00:00, 6.21it/s]
Transcribing: 100%|██████████| 1/1 [00:00<00:00, 6.21it/s]
[2.00s] Chunk 3/8: observed phoebe [EOU detected]
Transcribing: 100%|██████████| 1/1 [00:00<00:00, 5.58it/s]
Transcribing: 100%|██████████| 1/1 [00:00<00:00, 5.58it/s]
[3.00s] Chunk 4/8: turning away her
Transcribing: 100%|██████████| 1/1 [00:00<00:00, 5.61it/s]
Transcribing: 100%|██████████| 1/1 [00:00<00:00, 5.61it/s]
[4.00s] Chunk 5/8: eyes [EOU detected]
Transcribing: 100%|██████████| 1/1 [00:00<00:00, 5.69it/s]
Transcribing: 100%|██████████| 1/1 [00:00<00:00, 5.69it/s]
[5.00s] Chunk 6/8: it is certainly very [EOU detected]
Transcribing: 100%|██████████| 1/1 [00:00<00:00, 5.55it/s]
Transcribing: 100%|██████████| 1/1 [00:00<00:00, 5.55it/s]
[6.00s] Chunk 7/8: like the old portrait [EOU detected]
Transcribing: 100%|██████████| 1/1 [00:00<00:00, 6.28it/s]
[7.00s] Chunk 8/8: [EOU detected]
Final transcript: well i don't wish to see it anymore observed phoebe turning away her eyes it is certainly very like the old portrait