File size: 1,340 Bytes
5397a6e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import librosa
import torch
def preprocess_audio(file_path, target_sr=16000):
    """
    Read an audio file and convert it to the requested sampling rate.

    The file is first decoded at its own native sampling rate and is then
    resampled to ``target_sr`` in a separate step.

    Parameters:
        file_path (str): Location of the audio file to read.
        target_sr (int): Sampling rate of the returned signal, in Hz.
            Defaults to 16000.

    Returns:
        np.ndarray: The audio samples at ``target_sr``.
    """
    # sr=None asks librosa not to resample while decoding, so the second
    # element of the returned pair is the file's native sampling rate.
    waveform, native_sr = librosa.load(file_path, sr=None)
    return librosa.resample(
        waveform,
        orig_sr=native_sr,
        target_sr=target_sr,
    )

def transcribe_audio(model, processor, audio, target_sr=16000):
    """
    Transcribes the given audio using the Whisper model.

    The processor turns the raw samples into model input features, the
    model generates token ids from them with gradient tracking disabled,
    and the processor decodes those ids back into text.

    Parameters:
        model: The Whisper model. Must provide ``generate(input_features)``.
            If it exposes a ``device`` attribute, the input features are
            moved to that device before generation.
        processor: The processor used for preparing the input features and
            for decoding the generated ids (``batch_decode``).
        audio (np.ndarray): The resampled audio data.
        target_sr (int): The sampling rate of ``audio``, passed to the
            processor. Defaults to 16000 Hz.

    Returns:
        transcription (str): The transcribed text for the first (only)
        item of the decoded batch.
    """
    input_features = processor(
        audio, sampling_rate=target_sr, return_tensors="pt"
    ).input_features

    # The processor builds its tensors on the CPU. When the model lives on
    # another device (e.g. CUDA), passing CPU features to generate() fails
    # with a device-mismatch error, so align the features with the model.
    # Models that do not expose `device` are left untouched.
    model_device = getattr(model, "device", None)
    if model_device is not None:
        input_features = input_features.to(model_device)

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        predicted_ids = model.generate(input_features)

    # batch_decode returns one string per batch item; a single clip was
    # passed in, so the first entry is the transcription.
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    return transcription