import numpy as np
import librosa
import torch
from .init import processor, model
LIMIT = 90  # maximum audio length accepted by the model, in seconds


class A2T:
    """Audio-to-text transcription wrapper around a Whisper-style
    ``processor``/``model`` pair (imported from ``.init``).

    Parameters
    ----------
    mic : array-like or None
        Raw audio samples captured from a microphone. ``None`` means no
        audio was provided.
    """

    def __init__(self, mic):
        self.mic = mic

    def __preprocces(self, audio, frame_rate):
        """Normalize, downmix, resample, and truncate raw PCM audio.

        Parameters
        ----------
        audio : numpy.ndarray
            Raw samples; assumed int16-scaled PCM (mono or multi-channel).
        frame_rate : int
            Sampling rate of ``audio`` in Hz.

        Returns
        -------
        torch.Tensor or None
            Float tensor of mono 16 kHz audio, at most ``LIMIT`` seconds
            long, or ``None`` if any preprocessing step fails.
        """
        try:
            # int16 full-scale is 32768 (the previous 32678 was a typo
            # that mis-scaled every sample by ~0.3%).
            audio = audio / 32768.0
            if len(audio.shape) > 1:
                # Multi-channel input: librosa.to_mono expects
                # channels-first, hence the transpose.
                audio = librosa.to_mono(audio.T)
            if frame_rate != 16_000:
                audio = librosa.resample(audio, orig_sr=frame_rate, target_sr=16_000)
            # Keep at most LIMIT seconds of 16 kHz audio.
            audio = audio[:16_000 * LIMIT]
            return torch.tensor(audio)
        except Exception as e:
            print("Preprocces error", e)
            return None

    def predict(self):
        """Transcribe the stored microphone audio to English text.

        Returns
        -------
        str
            The transcription, or a human-readable error string when no
            audio is available or inference fails.
        """
        # Guard clause: nothing to transcribe.
        if self.mic is None:
            return "please provide audio"
        audio = self.mic

        try:
            forced_decoder_ids = processor.get_decoder_prompt_ids(language="english", task="transcribe")
            # NOTE(review): __preprocces is currently bypassed — the mic
            # audio is fed to the processor as-is and is assumed to already
            # be 16 kHz mono float samples. Confirm upstream, or re-enable:
            # audio = self.__preprocces(audio=audio, frame_rate=frame_rate)
            inputs = processor(audio=audio, sampling_rate=16000, return_tensors="pt")
            predicted_ids = model.generate(**inputs, max_length=400, forced_decoder_ids=forced_decoder_ids)
            transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
            return transcription[0]
        except Exception as e:
            print("Predict error", e)
            return "Oops some kinda error"
|