lsdrs committed on
Commit
d867b41
•
1 Parent(s): 1f3589e

initial commit

Browse files
Files changed (4) hide show
  1. README.md +1 -1
  2. app.py +58 -0
  3. requirements.txt +7 -0
  4. utils.py +157 -0
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 📉
4
  colorFrom: gray
5
  colorTo: pink
6
  sdk: gradio
7
- sdk_version: 4.16.0
8
  app_file: app.py
9
  pinned: false
10
  ---
 
4
  colorFrom: gray
5
  colorTo: pink
6
  sdk: gradio
7
+ sdk_version: 3.50.2
8
  app_file: app.py
9
  pinned: false
10
  ---
app.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Deploying AI Voice Chatbot Gradio App."""
2
+ from gradio import Audio, Interface, Textbox
3
+ from typing import Tuple
4
+
5
+ from utils import (TextGenerationPipeline, from_en_translation,
6
+ html_audio_autoplay, stt, to_en_translation, tts,
7
+ tts_to_bytesio)
8
+
9
+ max_answer_length = 100
10
+ desired_language = "pt"
11
+ response_generator_pipe = TextGenerationPipeline(max_length=max_answer_length)
12
+
13
+
14
+ def main(audio: object) -> Tuple[str, str, str, object]:
15
+ """Calls functions for deploying gradio app.
16
+
17
+ It responds both verbally and in text
18
+ by taking voice input from user.
19
+
20
+ Args:
21
+ audio (object): recorded speech of user
22
+
23
+ Returns:
24
+ tuple containing
25
+
26
+ - user_speech_text (str) : recognized speech
27
+ - bot_response_de (str) : translated answer of bot
28
+ - bot_response_en (str) : bot's original answer
29
+ - html (object) : autoplayer for bot's speech
30
+ """
31
+ user_speech_text = stt(audio, desired_language)
32
+ tranlated_text = to_en_translation(user_speech_text, desired_language)
33
+ bot_response_en = response_generator_pipe(tranlated_text)
34
+ bot_response_de = from_en_translation(bot_response_en, desired_language)
35
+ bot_voice = tts(bot_response_de, desired_language)
36
+ bot_voice_bytes = tts_to_bytesio(bot_voice)
37
+ html = html_audio_autoplay(bot_voice_bytes)
38
+ return user_speech_text, bot_response_de, bot_response_en, html
39
+
40
+
41
+ demo = Interface(
42
+ fn=main,
43
+ inputs=[
44
+ Audio(
45
+ source="microphone",
46
+ type="filepath",
47
+ ),
48
+ ],
49
+ outputs=[
50
+ Textbox(label="You said: "),
51
+ Textbox(label="AI said: "),
52
+ Textbox(label="AI said (English): "),
53
+ "html",
54
+ ],
55
+ live=True,
56
+ allow_flagging="never")
57
+
58
+ demo.launch()
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ --find-links https://download.pytorch.org/whl/torch_stable.html
2
+ torch==1.13.1+cpu
3
+ gradio==3.50.2
4
+ SpeechRecognition==3.9.0
5
+ mtranslate==1.8
6
+ gTTS==2.3.0
7
+ transformers==4.25.1
utils.py ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Some utility functions for the app."""
2
+ from base64 import b64encode
3
+ from io import BytesIO
4
+
5
+ from gtts import gTTS
6
+ from mtranslate import translate
7
+ from speech_recognition import AudioFile, Recognizer
8
+ from transformers import (BlenderbotSmallForConditionalGeneration,
9
+ BlenderbotSmallTokenizer)
10
+
11
+
12
+ def stt(audio: object, language: str) -> str:
13
+ """Converts speech to text.
14
+
15
+ Args:
16
+ audio: record of user speech
17
+
18
+ Returns:
19
+ text (str): recognized speech of user
20
+ """
21
+ r = Recognizer()
22
+ # open the audio file
23
+ with AudioFile(audio) as source:
24
+ # listen for the data (load audio to memory)
25
+ audio_data = r.record(source)
26
+ # recognize (convert from speech to text)
27
+ text = r.recognize_google(audio_data, language=language)
28
+ return text
29
+
30
+
31
+ def to_en_translation(text: str, language: str) -> str:
32
+ """Translates text from specified language to English.
33
+
34
+ Args:
35
+ text (str): input text
36
+ language (str): desired language
37
+
38
+ Returns:
39
+ str: translated text
40
+ """
41
+ return translate(text, "en", language)
42
+
43
+
44
+ def from_en_translation(text: str, language: str) -> str:
45
+ """Translates text from english to specified language.
46
+
47
+ Args:
48
+ text (str): input text
49
+ language (str): desired language
50
+
51
+ Returns:
52
+ str: translated text
53
+ """
54
+ return translate(text, language, "en")
55
+
56
+
57
+ class TextGenerationPipeline:
58
+ """Pipeline for text generation of blenderbot model.
59
+
60
+ Returns:
61
+ str: generated text
62
+ """
63
+
64
+ # load tokenizer and the model
65
+ model_name = "facebook/blenderbot_small-90M"
66
+ tokenizer = BlenderbotSmallTokenizer.from_pretrained(model_name)
67
+ model = BlenderbotSmallForConditionalGeneration.from_pretrained(model_name)
68
+
69
+ def __init__(self, **kwargs):
70
+ """Specifying text generation parameters.
71
+
72
+ For example: max_length=100 which generates text shorter than
73
+ 100 tokens. Visit:
74
+ https://huggingface.co/docs/transformers/main_classes/text_generation
75
+ for more parameters
76
+ """
77
+ self.__dict__.update(kwargs)
78
+
79
+ def preprocess(self, text) -> str:
80
+ """Tokenizes input text.
81
+
82
+ Args:
83
+ text (str): user specified text
84
+
85
+ Returns:
86
+ torch.Tensor (obj): text representation as tensors
87
+ """
88
+ return self.tokenizer(text, return_tensors="pt")
89
+
90
+ def postprocess(self, outputs) -> str:
91
+ """Converts tensors into text.
92
+
93
+ Args:
94
+ outputs (torch.Tensor obj): model text generation output
95
+
96
+ Returns:
97
+ str: generated text
98
+ """
99
+ return self.tokenizer.decode(outputs[0], skip_special_tokens=True)
100
+
101
+ def __call__(self, text: str) -> str:
102
+ """Generates text from input text.
103
+
104
+ Args:
105
+ text (str): user specified text
106
+
107
+ Returns:
108
+ str: generated text
109
+ """
110
+ tokenized_text = self.preprocess(text)
111
+ output = self.model.generate(**tokenized_text, **self.__dict__)
112
+ return self.postprocess(output)
113
+
114
+
115
+ def tts(text: str, language: str) -> object:
116
+ """Converts text into audio object.
117
+
118
+ Args:
119
+ text (str): generated answer of bot
120
+
121
+ Returns:
122
+ object: text to speech object
123
+ """
124
+ return gTTS(text=text, lang=language, slow=False)
125
+
126
+
127
+ def tts_to_bytesio(tts_object: object) -> bytes:
128
+ """Converts tts object to bytes.
129
+
130
+ Args:
131
+ tts_object (object): audio object obtained from gtts
132
+
133
+ Returns:
134
+ bytes: audio bytes
135
+ """
136
+ bytes_object = BytesIO()
137
+ tts_object.write_to_fp(bytes_object)
138
+ bytes_object.seek(0)
139
+ return bytes_object.getvalue()
140
+
141
+
142
+ def html_audio_autoplay(bytes: bytes) -> object:
143
+ """Creates html object for autoplaying audio at gradio app.
144
+
145
+ Args:
146
+ bytes (bytes): audio bytes
147
+
148
+ Returns:
149
+ object: html object that provides audio autoplaying
150
+ """
151
+ b64 = b64encode(bytes).decode()
152
+ html = f"""
153
+ <audio controls autoplay>
154
+ <source src="data:audio/wav;base64,{b64}" type="audio/wav">
155
+ </audio>
156
+ """
157
+ return html