Harveenchadha committed on
Commit
60648c4
1 Parent(s): 89702e3

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +94 -0
app.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import soundfile as sf
2
+ import torch
3
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
4
+ from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
5
+ import gradio as gr
6
+ import sox
7
+ import numpy as np
8
+ import yaml
9
+ import tensorflow as tf
10
+ from tensorflow_tts.inference import TFAutoModel
11
+ from tensorflow_tts.inference import AutoProcessor
12
+
13
+
14
+
15
+
16
# Text-to-speech models, loaded once at import time. Judging by the model ids,
# FastSpeech2 is the acoustic model and multi-band MelGAN is the vocoder, both
# trained on LJSpeech (English).
fastspeech2 = TFAutoModel.from_pretrained("tensorspeech/tts-fastspeech2-ljspeech-en")
mb_melgan = TFAutoModel.from_pretrained("tensorspeech/tts-mb_melgan-ljspeech-en")
# Named `tts_processor` (rather than a generic `processor`) so it cannot be
# clobbered by other processors loaded in this module, e.g. the
# speech-recognition one. tts() needs the object with `text_to_sequence`.
tts_processor = AutoProcessor.from_pretrained("tensorspeech/tts-fastspeech2-ljspeech-en")


def tts(text):
    """Synthesize `text` to speech and return the path of the resulting WAV file.

    Args:
        text: Text to speak (the loaded models are English ones).

    Returns:
        The string ``'./audio_after.wav'``, the file the audio generated from
        ``mel_after`` is written to (22050 Hz, 16-bit PCM).

    Side effects:
        Overwrites ``./audio_before.wav`` and ``./audio_after.wav`` in the
        current working directory on every call.
    """
    input_ids = tts_processor.text_to_sequence(text)

    # FastSpeech2 inference with batch size 1, speaker 0 and neutral
    # speed / pitch / energy ratios. Only the two mel outputs are used.
    mel_before, mel_after, _, _, _ = fastspeech2.inference(
        input_ids=tf.expand_dims(tf.convert_to_tensor(input_ids, dtype=tf.int32), 0),
        speaker_ids=tf.convert_to_tensor([0], dtype=tf.int32),
        speed_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
        f0_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
        energy_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
    )

    # MelGAN inference; [0, :, 0] selects the first batch item and the first
    # element of the last axis, giving a 1-D signal for soundfile.
    audio_before = mb_melgan.inference(mel_before)[0, :, 0]
    audio_after = mb_melgan.inference(mel_after)[0, :, 0]

    # Both variants are saved, but only the "after" file is returned.
    # NOTE(review): presumably before/after refers to the model's
    # post-processing network - confirm against TensorFlowTTS docs.
    sf.write('./audio_before.wav', audio_before, 22050, "PCM_16")
    sf.write('./audio_after.wav', audio_after, 22050, "PCM_16")
    return './audio_after.wav'
43
+
44
+
45
def convert(inputfile, outfile):
    """Re-encode `inputfile` with sox and write the result to `outfile`.

    The output is requested as a WAV file with one channel, signed-integer
    encoding, 16 bits per sample and a 16000 Hz sample rate.

    Args:
        inputfile: Path of the audio file to read.
        outfile: Path the converted WAV file is written to.
    """
    target_format = {
        "file_type": "wav",
        "channels": 1,
        "encoding": "signed-integer",
        "rate": 16000,
        "bits": 16,
    }
    transformer = sox.Transformer()
    transformer.set_output_format(**target_format)
    transformer.build(inputfile, outfile)
51
+
52
+
53
# M2M100 translation model and tokenizer, loaded once at import time.
model_translate = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
tokenizer_translate = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")

# Translation direction: source language code -> target language code.
inlang = 'hi'
outlang = 'en'
tokenizer_translate.src_lang = inlang


def translate(text):
    """Translate `text` from `inlang` to `outlang` and return the result.

    Args:
        text: Source-language text to translate.

    Returns:
        The first decoded sequence produced by the model, with special
        tokens stripped.
    """
    # Forcing the target-language id as the first generated token is how the
    # output language is selected here.
    target_lang_id = tokenizer_translate.get_lang_id(outlang)
    model_inputs = tokenizer_translate(text, return_tensors="pt")
    output_ids = model_translate.generate(**model_inputs, forced_bos_token_id=target_lang_id)
    decoded = tokenizer_translate.batch_decode(output_ids, skip_special_tokens=True)
    return decoded[0]
62
+
63
+
64
# Speech-recognition model (wav2vec2 with a CTC head) and its processor,
# loaded once at import time. The processor is named `asr_processor` (rather
# than a generic `processor`) so it cannot clash with other processors in this
# module, e.g. the text-to-speech one.
asr_processor = Wav2Vec2Processor.from_pretrained("Harveenchadha/vakyansh-wav2vec2-hindi-him-4200")
model = Wav2Vec2ForCTC.from_pretrained("Harveenchadha/vakyansh-wav2vec2-hindi-him-4200")


def parse_transcription(wav_file):
    """Transcribe recorded speech, translate the text and synthesize the translation.

    Args:
        wav_file: Object whose ``name`` attribute is the path of the recorded
            audio file.

    Returns:
        A tuple ``(transcription, translation, audio_path)``: the recognized
        text, its translation, and the path returned by ``tts`` for the
        spoken translation.

    Side effects:
        Writes a converted copy of the recording next to the input file,
        named ``<input path without extension>16k.wav``.
    """
    import os  # local import so this block does not depend on the file's import header

    # splitext strips only the final extension. Splitting on the first '.'
    # would truncate paths such as './rec.wav' or '/tmp/a.b/rec.wav'.
    base_path, _ = os.path.splitext(wav_file.name)
    resampled_path = base_path + "16k.wav"

    # The model is fed 16 kHz audio, so convert the recording first.
    convert(wav_file.name, resampled_path)
    speech, _ = sf.read(resampled_path)
    input_values = asr_processor(speech, sampling_rate=16_000, return_tensors="pt").input_values

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        logits = model(input_values).logits

    # Greedy decoding: take the most likely token id at every position.
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = asr_processor.decode(predicted_ids[0], skip_special_tokens=True)

    translation = translate(transcription)
    return transcription, translation, tts(translation)
76
+
77
+
78
+
79
+
80
+
81
# Input widget: audio recorded from the microphone, handed to the callback as a file.
input_ = gr.inputs.Audio(source="microphone", type="file")

# Output widgets, in the order parse_transcription returns its values.
output1 = gr.outputs.Textbox(label="Hindi Output from ASR")
output2 = gr.outputs.Textbox(label="English Translated Output")
output_audio = gr.outputs.Audio(type="file", label="Output Audio")

# Build the demo and start serving it.
gr.Interface(
    fn=parse_transcription,
    inputs=input_,
    outputs=[output1, output2, output_audio],
    analytics_enabled=False,
    show_tips=False,
    theme='huggingface',
    layout='vertical',
    title="Vakyansh: Speech To text for Indic Languages",
    description="This is a live demo for Speech to Speech Translation. Speak in Hindi and get output in English",
    enable_queue=True,
).launch(inline=False)