sasan committed
Commit 961640d
Parent: 14aa1e4

speech to text notebook

Files changed (1)
  1. stt.ipynb +292 -0
stt.ipynb ADDED
@@ -0,0 +1,292 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/opt/homebrew/Caskroom/miniconda/base/envs/llm/lib/python3.11/site-packages/transformers/utils/generic.py:441: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.\n",
+ " _torch_pytree._register_pytree_node(\n",
+ "/opt/homebrew/Caskroom/miniconda/base/envs/llm/lib/python3.11/site-packages/transformers/utils/generic.py:309: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.\n",
+ " _torch_pytree._register_pytree_node(\n",
+ "/opt/homebrew/Caskroom/miniconda/base/envs/llm/lib/python3.11/site-packages/transformers/utils/generic.py:309: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.\n",
+ " _torch_pytree._register_pytree_node(\n"
+ ]
+ }
+ ],
+ "source": [
+ "# STT (speech to text)\n",
+ "from transformers import WhisperProcessor, WhisperForConditionalGeneration, pipeline\n",
+ "import librosa\n",
+ "import torch  # used by the device-selection cell below\n",
+ "import torchaudio  # used to load audio files below"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
+ ]
+ }
+ ],
+ "source": [
+ "# load model and processor for speech-to-text\n",
+ "processor = WhisperProcessor.from_pretrained(\"openai/whisper-small\")\n",
+ "modelw = WhisperForConditionalGeneration.from_pretrained(\"openai/whisper-small\")\n",
+ "# modelw.config.forced_decoder_ids = None"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 51,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "device = \"cuda\" if torch.cuda.is_available() else \"cpu\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 52,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
+ ]
+ }
+ ],
+ "source": [
+ "transcriber = pipeline(\"automatic-speech-recognition\", model=\"openai/whisper-base.en\", device=device)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 53,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "device(type='cpu')"
+ ]
+ },
+ "execution_count": 53,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "transcriber.device"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 58,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# audio = \"/Users/sasan.jafarnejad/dev/uni/talking-car/audio1713724521.779008.wav\"\n",
+ "audio = \"/Users/sasan.jafarnejad/dev/uni/talking-car/audio/attenborough/neutral.wav\"\n",
+ "if isinstance(audio, str):\n",
+ " link_to_audio = audio\n",
+ " # torchaudio.load returns a (waveform, sample_rate) tuple\n",
+ " waveform, sr = torchaudio.load(link_to_audio)\n",
+ " audio = waveform.squeeze().numpy(), sr\n",
+ " # average the channels if the recording is stereo\n",
+ " if len(audio[0].shape) == 2:\n",
+ " audio = audio[0].mean(axis=0), audio[1]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 59,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "y, sr = audio"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 60,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "out = transcriber({\"sampling_rate\": sr, \"raw\": y})[\"text\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 61,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "\" If you speed up time, plants begin to reveal their true nature. They're not passive organisms, as you might think, but competitive creatures every bit as aggressive as animals. They're locked in a desperate battle for light and space. They stretch and pulse as they strive to barge their way into pole position. Creepers and vines reach around for the branch or stem of another plant on which to hitch a ride. This is an assassin bug. To us, it's easy enough to spot because it moves. To its prey, that's irrelevant, because it smells like one of their number. The assassin sucks its victims dry and blues their empty husks onto its back. This one is already carrying at least 20 corpses. Its irregular shape makes it hard for other predators to spot it and makes it virtually invisible to its prey, ants. It enters this ant colony unchallenged. Its coat of ant corpses masks its own odour. To the ants, it smells like one of their own, and that's what matters. They'll even run straight over the top of it. The assassin simply takes an ant whenever it feels hungry, and the body of each victim then adds to its disguise. The giants here too. This is the Moala Moala, the sunfish. It's huge, three meters across, and addicted to lying on its side at the surface. It eats vast quantities of jellyfish. And there are not only fish from me in these waters, there are mammals. sea lions, whose ancestors originally came from the coasts of California. The Galapagos plankton is so abundant it attracts some of the biggest of all ocean mammals, humpback whales, and rivalling them in size the biggest of all fish. 20 ton whale shark. Few parts of the world's oceans can equally as Galapagos waters for sheer variety and abundance. That creature was a penguin. Penguins are ocean-goings for moose. But a few thousand years ago some of them got caught in the cold waters of the Humboldt current and were carried northwards up the coast of South America and out to the Galapagos. They could hardly have found anywhere more different from their polar home and in and the response they chain, the Emperor penguin that lives near the South Pole stands over a metre high, Galapagos penguin is now only half, and that helps a lot in the Galapagos. Small animals lose heat much faster than big ones, and the penguins have developed behavioral tricks as well. Bear feet are easily sunburnt, so they do their best to keep them covered, and some parts of the sea around the islands are quite cool. The humbalt current flowing up from the Antarctic and washing around the western parts of the archipelago is still quite chilly. So most of the penguins stay in the channel between the two westernmost island and when things get really hot, they can still cool off with the swim. They're quick to detect the slightest variation in temperature and move around to find places where an eddy might have brought a pleasing chill. The arrival of penguins must be the most unlikely event in the whole story of the colonization of the Galapagos. The existence of creatures like these so far from the nearest continent poses many questions. How, for example, did these enormous beasts get to the islands in the first place? But perhaps the most extraordinary thing about the Galapagos tortoises is that they're not all the same. islands have different kinds. In the heyday there were 15 species. They seem to have appeared in an evolutionary blink of the eye. But will soon become a leaf. It's no ordinary. It has a special altogether more sinister. 
This is Nepenthes, the pitcher plant. It grows in nutrient poor soils, So has to find nitrogen and minerals in another way. The leaf, just like a flower, attracts insects with a reward. The pitcher is coloured and scented to appeal to flies looking for a meal of rotting flesh. The visitors are rewarded with a greasy substance on the underside of the pitcher's lid. But the plant wants something in return, not pollen, but a meal. The lip of the pitcher is covered in tiny slippery ridges. Wax lubricates the surface further. It's extremely difficult to hold on, even for a fly. Once inside, there's no escape. The leaf holds a pool of digestive liquid. This contains microscopic elastic filaments, which give it the properties of quicksand. The more the insect struggles, the deeper it sinks. Enzymes begin to dissolve the victim's body while it's still alive.\""
+ ]
+ },
+ "execution_count": 61,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "out"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def transcript(audio):\n",
+ " # audio may be a file path or an (audio_array, sampling_rate) tuple\n",
+ " if isinstance(audio, str):\n",
+ " link_to_audio = audio\n",
+ " waveform, sampling_rate = torchaudio.load(link_to_audio)\n",
+ " audio_array = waveform.squeeze().numpy()\n",
+ " else:\n",
+ " audio_array, sampling_rate = audio\n",
+ "\n",
+ " # process the audio array\n",
+ " input_features = processor(audio_array, sampling_rate=sampling_rate, return_tensors=\"pt\").input_features\n",
+ " predicted_ids = modelw.generate(input_features)\n",
+ "\n",
+ " transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)\n",
+ "\n",
+ " return transcription"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def transcript(general_context, link_to_audio, voice, place, time, delete_history, state):\n",
+ " \"\"\"This function manages speech-to-text to feed the Fnanswer function, and text-to-speech with the Fnanswer output.\"\"\"\n",
+ " # load audio from a specific path\n",
+ " audio_path = link_to_audio\n",
+ " audio_array, sampling_rate = librosa.load(link_to_audio, sr=16000) # sr=16000 resamples to the rate Whisper expects\n",
+ "\n",
+ " # process the audio array\n",
+ " input_features = processor(audio_array, sampling_rate=sampling_rate, return_tensors=\"pt\").input_features\n",
+ " predicted_ids = modelw.generate(input_features)\n",
+ "\n",
+ " transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)\n",
+ "\n",
+ " return audio_path, state['context'], state"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 64,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Running on local URL: http://127.0.0.1:7875\n",
+ "\n",
+ "To create a public link, set `share=True` in `launch()`.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "<div><iframe src=\"http://127.0.0.1:7875/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
+ ],
+ "text/plain": [
+ "<IPython.core.display.HTML object>"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/plain": []
+ },
+ "execution_count": 64,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
229
+ "import numpy as np\n",
230
+ "import gradio as gr\n",
231
+ "import torchaudio\n",
232
+ "import time\n",
233
+ "import torch\n",
234
+ "\n",
235
+ "def save_audio_as_wav(data, sample_rate, file_path):\n",
236
+ " # make a tensor from the numpy array\n",
237
+ " data = torch.tensor(data).reshape(1, -1)\n",
238
+ " torchaudio.save(file_path, data, sample_rate=sample_rate, bits_per_sample=16, encoding=\"PCM_S\")\n",
239
+ "\n",
240
+ "def save_and_transcribe_audio(audio):\n",
241
+ " # capture the audio and save it to a file as wav or mp3\n",
242
+ " # file_name = save(\"audioinput.wav\")\n",
243
+ " sr, y = audio\n",
244
+ " # y = y.astype(np.float32)\n",
245
+ " # y /= np.max(np.abs(y))\n",
246
+ "\n",
247
+ " # add timestamp to file name\n",
248
+ " filename = f\"audio{time.time()}.wav\"\n",
249
+ " save_audio_as_wav(y, sr, filename)\n",
250
+ " \n",
251
+ " sr, y = audio\n",
252
+ " y = y.astype(np.float32)\n",
253
+ " y /= np.max(np.abs(y))\n",
254
+ " text = transcriber({\"sampling_rate\": sr, \"raw\":y})[\"text\"]\n",
255
+ " return text\n",
256
+ "\n",
257
+ "gr.Interface(\n",
258
+ " fn=save_and_transcribe_audio, \n",
259
+ " inputs=gr.Audio(sources=\"microphone\", type=\"numpy\", label=\"Record Audio\"), \n",
260
+ " outputs=\"text\").launch()"
261
+ ]
262
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "llm",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+ }