File size: 1,817 Bytes
3edbfec
 
2ba318d
 
0560145
a677076
 
2ba318d
 
a677076
5b8ef5f
a677076
294e7fb
a677076
204958c
a677076
23b8975
a677076
 
 
23b8975
 
a677076
 
 
23b8975
 
a677076
 
23b8975
 
a677076
 
23b8975
 
a677076
 
294e7fb
a677076
 
 
6d742e1
fd86276
 
a677076
 
 
 
 
fd86276
a677076
 
 
 
 
294e7fb
a677076
2ba318d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import numpy as np

import librosa
import torch
from .init import processor, model

LIMIT = 90 # maximum audio length to keep, in seconds (truncated as 16_000 * LIMIT samples)

class A2T:
    """Audio-to-text transcription wrapper around a Whisper-style processor/model pair.

    Parameters
    ----------
    mic : array-like or None
        Raw audio samples captured from the microphone. ``predict`` feeds
        them to the processor as-is, so they are presumably already mono
        float samples at 16 kHz — TODO confirm upstream.
    """

    def __init__(self, mic):
        self.mic = mic

    def __preprocces(self, audio, frame_rate):
        """Normalize, down-mix, resample and truncate raw audio.

        Parameters
        ----------
        audio : numpy.ndarray
            Raw samples, int16-scaled floats; shape ``(n,)`` or ``(n, channels)``.
        frame_rate : int
            Sampling rate of ``audio`` in Hz.

        Returns
        -------
        torch.Tensor or None
            At most ``LIMIT`` seconds of mono 16 kHz audio in [-1, 1],
            or ``None`` when preprocessing fails.
        """
        try:
            print("Audio before : ", audio)
            # int16 full-scale is 32768 (2**15); the previous divisor 32678
            # was a digit transposition that left samples slightly outside
            # the [-1, 1] range expected downstream.
            audio = audio / 32768.0
            print("Audio div : ", audio)

            # Down-mix multi-channel audio to mono; librosa expects
            # channels-first, hence the transpose.
            if len(audio.shape) > 1:
                audio = librosa.to_mono(audio.T)

            print("Audio mono : ", audio)

            if frame_rate != 16_000:
                audio = librosa.resample(audio, orig_sr=frame_rate, target_sr=16_000)

            print("Audio resample : ", audio)

            # Keep at most LIMIT seconds of audio at the 16 kHz target rate.
            audio = audio[:16_000 * LIMIT]

            print("Audio cut : ", audio)

            audio = torch.tensor(audio)

            print("Audio torch : ", audio)
            return audio
        except Exception as e:
            # Best-effort: log and signal failure rather than crash the caller.
            print("Preprocces error", e)
            return None

    def predict(self):
        """Transcribe ``self.mic`` to English text.

        Returns
        -------
        str
            The transcription, or a human-readable error message when no
            audio was provided or inference fails.
        """
        if self.mic is not None:
            audio = self.mic
            # frame_rate = self.mic.frame_rate
        else:
            return "please provide audio"

        try:
            # Force English transcription regardless of detected language.
            forced_decoder_ids = processor.get_decoder_prompt_ids(language="english", task="transcribe")
            # audio = self.__preprocces(audio=audio, frame_rate=frame_rate)
            inputs = processor(audio=audio, sampling_rate=16000, return_tensors="pt")
            predicted_ids = model.generate(**inputs, max_length=400, forced_decoder_ids=forced_decoder_ids)
            transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
            return transcription[0]
        except Exception as e:
            print("Predict error", e)
            return "Oops some kinda error"