gdnartea committed
Commit b09cd28
1 Parent(s): f7b4e3f

Update app.py

Files changed (1)
  1. app.py +118 -21
app.py CHANGED
@@ -1,22 +1,119 @@
  import gradio as gr
- from transformers import pipeline
- 
- model_id = "nvidia/canary-1b"  # update with your model id
- pipe = pipeline("automatic-speech-recognition", model=model_id)
- 
- def transcribe_speech(filepath):
-     output = pipe(
-         filepath,
-         max_new_tokens=256,
-         generate_kwargs={
-             "task": "transcribe",
-             "language": "english",
-         },  # update with the language you've fine-tuned on
-         chunk_length_s=30,
-         batch_size=8,
-     )
-     return output["text"]
- 
- demo = gr.Interface(fn=transcribe_speech, inputs=gr.Audio(sources=["microphone"]), outputs="text")
- 
- demo.launch()
+ import json
+ import librosa
+ import os
+ import soundfile as sf
+ import tempfile
+ import uuid
+ 
+ import torch
+ 
+ from nemo.collections.asr.models import ASRModel
+ from nemo.collections.asr.parts.utils.streaming_utils import FrameBatchMultiTaskAED
+ from nemo.collections.asr.parts.utils.transcribe_utils import get_buffered_pred_feat_multitaskAED
+ 
+ SAMPLE_RATE = 16000  # Hz
+ MAX_AUDIO_MINUTES = 10  # won't try to transcribe if longer than this
+ 
+ model = ASRModel.from_pretrained("nvidia/canary-1b")
+ model.eval()
+ 
+ # make sure beam size is always 1 for consistency
+ model.change_decoding_strategy(None)
+ decoding_cfg = model.cfg.decoding
+ decoding_cfg.beam.beam_size = 1
+ model.change_decoding_strategy(decoding_cfg)
+ 
+ # set up for buffered inference
+ model.cfg.preprocessor.dither = 0.0
+ model.cfg.preprocessor.pad_to = 0
+ 
+ feature_stride = model.cfg.preprocessor['window_stride']
+ model_stride_in_secs = feature_stride * 8  # 8 = encoder subsampling stride for FastConformer
+ 
+ frame_asr = FrameBatchMultiTaskAED(
+     asr_model=model,
+     frame_len=40.0,
+     total_buffer=40.0,
+     batch_size=16,
+ )
+ 
+ amp_dtype = torch.float16
+ 
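+ # NOTE: convert_audio is called by transcribe() below but its definition was
+ # not captured in this diff; the following is a reconstructed minimal sketch
+ # so the file runs end to end. It loads the input as mono, rejects audio
+ # longer than MAX_AUDIO_MINUTES, resamples to SAMPLE_RATE, and returns the
+ # converted filepath plus the duration in seconds.
+ def convert_audio(audio_filepath, tmpdir, utt_id):
+     data, sr = librosa.load(audio_filepath, sr=None, mono=True)
+ 
+     duration = librosa.get_duration(y=data, sr=sr)
+     if duration / 60.0 > MAX_AUDIO_MINUTES:
+         raise gr.Error(f"This demo can only transcribe up to {MAX_AUDIO_MINUTES} minutes of audio")
+ 
+     if sr != SAMPLE_RATE:
+         data = librosa.resample(data, orig_sr=sr, target_sr=SAMPLE_RATE)
+ 
+     out_filename = os.path.join(tmpdir, utt_id + '.wav')
+     sf.write(out_filename, data, SAMPLE_RATE)
+ 
+     return out_filename, duration
+ 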
+ 
+ def transcribe(audio_filepath, src_lang="English", tgt_lang="English", pnc="yes"):
+ 
+     if audio_filepath is None:
+         raise gr.Error("Please provide some input audio: either upload an audio file or use the microphone")
+ 
+     utt_id = uuid.uuid4()
+     with tempfile.TemporaryDirectory() as tmpdir:
+         converted_audio_filepath, duration = convert_audio(audio_filepath, tmpdir, str(utt_id))
+ 
+         # map src_lang and tgt_lang from long versions to short
+         LANG_LONG_TO_LANG_SHORT = {
+             "English": "en",
+             "Spanish": "es",
+             "French": "fr",
+             "German": "de",
+         }
+         if src_lang not in LANG_LONG_TO_LANG_SHORT:
+             raise ValueError(f"src_lang must be one of {list(LANG_LONG_TO_LANG_SHORT)}")
+         src_lang = LANG_LONG_TO_LANG_SHORT[src_lang]
+ 
+         if tgt_lang not in LANG_LONG_TO_LANG_SHORT:
+             raise ValueError(f"tgt_lang must be one of {list(LANG_LONG_TO_LANG_SHORT)}")
+         tgt_lang = LANG_LONG_TO_LANG_SHORT[tgt_lang]
+ 
+         # infer taskname from src_lang and tgt_lang
+         if src_lang == tgt_lang:
+             taskname = "asr"
+         else:
+             taskname = "s2t_translation"
+ 
+         # normalize pnc to the "yes"/"no" strings the manifest expects
+         pnc = "yes" if pnc else "no"
+ 
+         # make manifest file and save
+         manifest_data = {
+             "audio_filepath": converted_audio_filepath,
+             "source_lang": src_lang,
+             "target_lang": tgt_lang,
+             "taskname": taskname,
+             "pnc": pnc,
+             "answer": "predict",
+             "duration": str(duration),
+         }
+ 
+         manifest_filepath = os.path.join(tmpdir, f'{utt_id}.json')
+ 
+         with open(manifest_filepath, 'w') as fout:
+             fout.write(json.dumps(manifest_data) + '\n')
+ 
+         # call transcribe, passing in manifest filepath
+         if duration < 40:
+             output_text = model.transcribe(manifest_filepath)[0]
+         else:  # do buffered inference
+             # enable autocast only on CUDA so CPU-only inference still works
+             with torch.amp.autocast(model.device.type, dtype=amp_dtype, enabled=(model.device.type == "cuda")):
+                 with torch.no_grad():
+                     hyps = get_buffered_pred_feat_multitaskAED(
+                         frame_asr,
+                         model.cfg.preprocessor,
+                         model_stride_in_secs,
+                         model.device,
+                         manifest=manifest_filepath,
+                         filepaths=None,
+                     )
+ 
+             output_text = hyps[0].text
+ 
+     return output_text
+ 
+ 
+ iface = gr.Interface(fn=transcribe, inputs=gr.Audio(sources=["microphone"]), outputs="text")
+ 
+ iface.launch()
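
Note: as committed, the Interface wires up only the microphone input, so src_lang, tgt_lang, and pnc always keep their defaults and the speech-translation path is unreachable from the UI. A minimal sketch of how those parameters could be exposed (choices mirror the LANG_LONG_TO_LANG_SHORT map above; widget labels are illustrative, not part of this commit):

    iface = gr.Interface(
        fn=transcribe,
        inputs=[
            gr.Audio(sources=["microphone"]),
            gr.Dropdown(["English", "Spanish", "French", "German"], value="English", label="Source language"),
            gr.Dropdown(["English", "Spanish", "French", "German"], value="English", label="Target language"),
            gr.Checkbox(value=True, label="Punctuation & capitalization"),
        ],
        outputs="text",
    )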