pyannote-speaker-diarization / audio / audioanalyser_francais.py
#from transformers import WhisperProcessor, WhisperForConditionalGeneration
#from datasets import Audio, load_dataset
#
## load model and processor
#processor = WhisperProcessor.from_pretrained("openai/whisper-base")
#model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base")
#forced_decoder_ids = processor.get_decoder_prompt_ids(language="french", task="transcribe")
#
## load a streaming dataset and read the first audio sample
## NOTE: the dataset name below is only a placeholder; any corpus with an "audio" column resampled to 16 kHz works
#ds = load_dataset("mozilla-foundation/common_voice_11_0", "fr", split="test", streaming=True)
#ds = ds.cast_column("audio", Audio(sampling_rate=16_000))
#input_speech = next(iter(ds))["audio"]
#input_features = processor(input_speech["array"], sampling_rate=input_speech["sampling_rate"], return_tensors="pt").input_features
#
## generate token ids, forcing French transcription
#predicted_ids = model.generate(input_features, forced_decoder_ids=forced_decoder_ids)
## decode token ids to text (special tokens still included)
#transcription = processor.batch_decode(predicted_ids)
#
## decode again, skipping special tokens to keep only the plain text
#transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)