The model was fine-tuned on 300 hours of public and private speech data. More details will be provided once the underlying paper is published.
```python
import librosa
import torch
from transformers import AutoModelForCTC, Wav2Vec2Processor

# Load the audio at the 16 kHz sampling rate the model expects
audio, _ = librosa.load("[audio_path]", sr=16000)

model = AutoModelForCTC.from_pretrained("racai/wav2vec2-base-100k-voxpopuli-romanian")
processor = Wav2Vec2Processor.from_pretrained("racai/wav2vec2-base-100k-voxpopuli-romanian")

# Convert the waveform into model inputs
input_dict = processor(audio, sampling_rate=16000, return_tensors="pt")

# Run inference without tracking gradients
with torch.inference_mode():
    logits = model(input_dict.input_values).logits

# Greedy CTC decoding: pick the most likely token at each frame
predicted_ids = torch.argmax(logits, dim=-1)
predicted_sentence = processor.batch_decode(predicted_ids)[0]

print("Prediction:", predicted_sentence)
```