# Example usage

## Install requirements

In [None]:
# Standard requirements:
# !pip install datasets torch torchcodec transformers sentencepiece
# Google Colab requirements:
!pip install "torchcodec~=0.7.0"

## Automatically instantiate the model

In [None]:
import torch
from transformers import AutoFeatureExtractor, AutoModel, AutoTokenizer

# Hugging Face Hub repository that provides the feature extractor, model and
# tokenizer loaded below.
model_id = "abr-ai/asr-19m-v2-en-32b"

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# trust_remote_code=True allows transformers to execute Python code shipped in
# the model repository, so only use it with repositories you trust.
# The feature extractor and the model are moved to `device`; the tokenizer is not.
feature_extractor = AutoFeatureExtractor.from_pretrained(
    model_id, trust_remote_code=True
).to(device)
model = AutoModel.from_pretrained(model_id, trust_remote_code=True).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

## Load example data (LibriSpeech)

In [None]:
from datasets import load_dataset

# Open the "clean" configuration of LibriSpeech (test split) in streaming mode.
dataset = load_dataset(
    path="librispeech_asr",
    name="clean",
    split="test",
    streaming=True,
)

# Materialise only the first 3 examples into a list for the cells below.
samples = list(dataset.take(3))

## Simple transcription

In [None]:
# Transcribe a single utterance: the first of the loaded samples.
first_sample = samples[0]
audio = first_sample["audio"]["array"]

# NOTE(review): this cell passes the extractor output straight to the model and
# the model output straight to the decoder, whereas the batched cell indexes
# them by key ("input_features", "mask", "logits") — confirm that the remote
# code supports both call forms.

# Inference only: turn off autograd tracking so no graph is recorded for the
# forward pass.
with torch.inference_mode():
    features = feature_extractor(audio)
    logits = model(features)

transcription = tokenizer.decode_from_logits(logits)

print(f"Reference text: {first_sample['text'].lower()}")
# The decoded result is indexed, so entry 0 is the text for this single input.
print(f"Transcription: {transcription[0]}\n")

## Batched transcription

In [None]:
# Transcribe all loaded samples in one batch.
audio_list = [sample["audio"]["array"] for sample in samples]

# Inference only: turn off autograd tracking so no graph is recorded for the
# forward pass.
with torch.inference_mode():
    # For a list input, the extractor output is read by key: the features plus
    # a mask that is forwarded to the model.
    batch_features = feature_extractor(audio_list)
    batch_outputs = model(
        batch_features["input_features"], mask=batch_features["mask"]
    )

# The model's own output mask is passed to the decoder together with the logits.
transcriptions = tokenizer.decode_from_logits(
    batch_outputs["logits"], mask=batch_outputs["mask"]
)

# Print each reference next to the transcription at the same position.
for i, sample in enumerate(samples):
    print(f"Reference text: {sample['text'].lower()}")
    print(f"Transcription: {transcriptions[i]}")
    print("-" * 30)