MartinT committed on
Commit
0c7f41b
1 Parent(s): 97bd8a9

feat: Init.

Files changed (7)
  1. Dockerfile +18 -0
  2. README.md +74 -6
  3. app.py +207 -0
  4. cards.py +145 -0
  5. record.js +130 -0
  6. requirements.txt +2 -0
  7. utils.py +9 -0
Dockerfile ADDED
@@ -0,0 +1,18 @@
+ # read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
+ # you will also find guides on how best to write your Dockerfile
+
+ FROM python:3.9
+
+ WORKDIR /code
+
+ COPY ./requirements.txt /code/requirements.txt
+
+ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+ RUN apt-get update && apt-get install -y ffmpeg
+
+ COPY . .
+
+ ENV H2O_WAVE_LISTEN=":7860"
+ ENV H2O_WAVE_ADDRESS='http://127.0.0.1:7860'
+
+ CMD ["wave", "run", "app", "--no-reload"]
README.md CHANGED
@@ -1,11 +1,79 @@
  ---
- title: H2o Wave Whisper
- emoji: 🐢
+ title: H2O Wave NER Annotation
+ emoji: 📝
  colorFrom: yellow
- colorTo: blue
+ colorTo: gray
  sdk: docker
- pinned: false
- license: apache-2.0
+ app_port: 7860
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ <div align='center'>
+
+ <h1>WaveTon</h1>
+ 💯 Wave applications
+
+ <br>
+ <br>
+
+ [![License](https://img.shields.io/badge/license-Apache%202.0-blue.svg?logo=apache)](https://github.com/vopani/waveton/blob/master/LICENSE)
+ [![GitHub](https://img.shields.io/github/stars/vopani/waveton?color=yellowgreen&logo=github)](https://img.shields.io/github/stars/vopani/waveton?color=yellowgreen&logo=github)
+ [![Twitter](https://img.shields.io/twitter/follow/vopani)](https://twitter.com/vopani)
+
+ </div>
+
+ ## Whisper 🖥️
+
+ Speech to text using OpenAI's Whisper model.
+
+ ![](demo.gif)
+
+ ## Setup ⚙️
+
+ 1. Check the version of Python: Python 3.9+ is required, and Python 3.10+ is recommended for the best experience
+
+ ```commandline
+ python3 --version
+ ```
+
+ 2. Clone the repository
+
+ ```commandline
+ git clone https://github.com/vopani/waveton.git
+ ```
+
+ 3. Create a virtual environment
+
+ ```commandline
+ cd waveton/apps/deeplearning_apps/whisper
+ python3 -m venv venv
+ source venv/bin/activate
+ ```
+
+ 4. Install ffmpeg
+
+ On Linux:
+
+ ```commandline
+ sudo apt update && sudo apt install ffmpeg
+ ```
+
+ On Mac:
+
+ ```commandline
+ brew install ffmpeg
+ ```
+
+ 5. Install the packages
+
+ ```commandline
+ python3 -m pip install -U pip
+ python3 -m pip install -r requirements.txt
+ ```
+
+ 6. Run the application
+
+ ```commandline
+ wave run app
+ ```
+
+ 7. View the application in your local browser: [http://localhost:10101](http://localhost:10101)
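The README above pitches the app as speech-to-text with OpenAI's Whisper. Stripped of the Wave UI, the core of the app is the pair of Whisper calls that `app.py` makes; a minimal standalone sketch (the `sound.wav` filename is only an assumption, mirroring the name the in-browser recorder uploads):

```python
import whisper

# Load the same checkpoint the app loads once at startup (app.py uses 'base').
model = whisper.load_model('base')

# Transcribe a local WAV file; 'sound.wav' is a hypothetical recording.
# app.py reads the transcription from result['text'] in the same way.
result = model.transcribe('sound.wav')
print(result['text'])
```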
app.py ADDED
@@ -0,0 +1,207 @@
+ import logging
+
+ from h2o_wave import Q, main, app, copy_expando, handle_on, on
+ import whisper
+
+ import cards
+ from utils import get_inline_script
+
+ # Set up logging
+ logging.basicConfig(format='%(levelname)s:\t[%(asctime)s]\t%(message)s', level=logging.INFO)
+
+
+ @app('/')
+ async def serve(q: Q):
+     """
+     Main entry point. All queries pass through this function.
+     """
+
+     try:
+         # Initialize the app if not already
+         if not q.app.initialized:
+             await initialize_app(q)
+
+         # Initialize the client if not already
+         if not q.client.initialized:
+             await initialize_client(q)
+
+         # Update theme if toggled
+         elif q.args.theme_dark is not None and q.args.theme_dark != q.client.theme_dark:
+             await update_theme(q)
+
+         # Run inference if audio is recorded
+         elif q.events.audio:
+             await audio_inference(q)
+
+         # Delegate query to query handlers
+         elif await handle_on(q):
+             pass
+
+         # Adding this condition to help in identifying bugs
+         else:
+             await handle_fallback(q)
+
+     except Exception as error:
+         await show_error(q, error=str(error))
+
+
+ async def initialize_app(q: Q):
+     """
+     Initialize the app.
+     """
+
+     logging.info('Initializing app')
+
+     # Set initial argument values
+     q.app.cards = ['main', 'error']
+
+     q.app.model = whisper.load_model('base')
+
+     q.app.initialized = True
+
+
+ async def initialize_client(q: Q):
+     """
+     Initialize the client (browser tab).
+     """
+
+     logging.info('Initializing client')
+
+     # Set initial argument values
+     q.client.theme_dark = True
+
+     # Add layouts, scripts, header and footer
+     q.page['meta'] = cards.meta
+     q.page['header'] = cards.header
+     q.page['footer'] = cards.footer
+
+     # Add cards for the main page
+     q.page['asr'] = cards.asr()
+
+     q.client.initialized = True
+
+     await q.page.save()
+
+
+ async def update_theme(q: Q):
+     """
+     Update theme of app.
+     """
+
+     # Copying argument values to client
+     copy_expando(q.args, q.client)
+
+     if q.client.theme_dark:
+         logging.info('Updating theme to dark mode')
+
+         # Update theme from light to dark mode
+         q.page['meta'].theme = 'h2o-dark'
+         q.page['header'].icon_color = 'black'
+     else:
+         logging.info('Updating theme to light mode')
+
+         # Update theme from dark to light mode
+         q.page['meta'].theme = 'light'
+         q.page['header'].icon_color = '#FEC924'
+
+     await q.page.save()
+
+
+ @on('start')
+ async def start_recording(q: Q):
+     """
+     Start recording audio.
+     """
+
+     logging.info('Starting recording')
+
+     q.page['meta'].script = get_inline_script('startRecording()')
+     q.page['asr'] = cards.asr(recording=True)
+
+     await q.page.save()
+
+
+ @on('stop')
+ async def stop_recording(q: Q):
+     """
+     Stop recording audio.
+     """
+
+     logging.info('Stopping recording')
+
+     q.page['meta'].script = get_inline_script('stopRecording()')
+     q.page['asr'] = cards.asr()
+
+     await q.page.save()
+
+
+ @on('audio')
+ async def audio_inference(q: Q):
+     """
+     Running ASR inference on audio.
+     """
+
+     logging.info('Inferencing recorded audio')
+
+     audio_path = await q.site.download(q.events.audio.captured, '.')
+
+     q.client.transcription = q.app.model.transcribe(audio_path)['text']
+
+     q.page['asr'] = cards.asr(audio_path=q.events.audio.captured, transcription=q.client.transcription)
+
+     await q.page.save()
+
+
+ def clear_cards(q: Q, card_names: list):
+     """
+     Clear cards from the page.
+     """
+
+     logging.info('Clearing cards')
+
+     # Delete cards from the page
+     for card_name in card_names:
+         del q.page[card_name]
+
+
+ async def show_error(q: Q, error: str):
+     """
+     Displays errors.
+     """
+
+     logging.error(error)
+
+     # Clear all cards
+     clear_cards(q, q.app.cards)
+
+     # Format and display the error
+     q.page['error'] = cards.crash_report(q)
+
+     await q.page.save()
+
+
+ @on('reload')
+ async def reload_client(q: Q):
+     """
+     Reset the client.
+     """
+
+     logging.info('Reloading client')
+
+     # Clear all cards
+     clear_cards(q, q.app.cards)
+
+     # Reload the client
+     await initialize_client(q)
+
+
+ async def handle_fallback(q: Q):
+     """
+     Handle fallback cases.
+     """
+
+     logging.info('Adding fallback page')
+
+     q.page['fallback'] = cards.fallback
+
+     await q.page.save()
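Routing in `serve` relies on `handle_on`, which matches `q.args.<name>` against the registered `@on('<name>')` handlers, so extending the app usually means adding one more decorated coroutine. A hypothetical sketch of that pattern, as it would sit in app.py alongside the existing handlers (the `clear` button and handler are not part of this commit; they only illustrate the mechanism):

```python
# Hypothetical extension: a button named 'clear' on the ASR card would be routed here by handle_on(q).
@on('clear')
async def clear_transcription(q: Q):
    """
    Reset the ASR card back to its idle state.
    """
    q.client.transcription = ''
    q.page['asr'] = cards.asr()  # rebuild the card with no audio and no transcription

    await q.page.save()
```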
cards.py ADDED
@@ -0,0 +1,145 @@
+ import sys
+ import traceback
+
+ from h2o_wave import Q, expando_to_dict, ui
+
+ # App name
+ app_name = 'Whisper'
+
+ # Link to repo. Report bugs/features here :)
+ repo_url = 'https://github.com/vopani/waveton'
+ issue_url = f'{repo_url}/issues/new?assignees=vopani&labels=bug&template=error-report.md&title=%5BERROR%5D'
+
+ # JS scripts
+ encoder_url = 'https://cdn.jsdelivr.net/npm/opus-media-recorder@latest/encoderWorker.umd.js'
+ recorder_url = 'https://cdn.jsdelivr.net/npm/opus-media-recorder@latest/OpusMediaRecorder.umd.js'
+
+ with open('record.js', encoding='utf-8') as f:
+     recorder_script = ui.inline_script(f.read())
+
+ # A meta card to hold the app's title, layouts, dialogs, theme and other meta information
+ meta = ui.meta_card(
+     box='',
+     title='WaveTon',
+     layouts=[
+         ui.layout(
+             breakpoint='xs',
+             zones=[
+                 ui.zone(name='header'),
+                 ui.zone(name='main'),
+                 ui.zone(name='footer')
+             ]
+         )
+     ],
+     theme='h2o-dark',
+     scripts=[
+         ui.script(encoder_url, asynchronous=False),
+         ui.script(recorder_url, asynchronous=False)
+     ],
+     script=recorder_script
+ )
+
+ # The header shown on all the app's pages
+ header = ui.header_card(
+     box='header',
+     title='Whisper',
+     subtitle="Speech to text using OpenAI's Whisper model",
+     icon='Microphone',
+     icon_color='black',
+     items=[ui.toggle(name='theme_dark', label='Dark Mode', value=True, trigger=True)]
+ )
+
+ # The footer shown on all the app's pages
+ footer = ui.footer_card(
+     box='footer',
+     caption=f'Learn more about <a href="{repo_url}" target="_blank"> WaveTon: 💯 Wave Applications</a>'
+ )
+
+ # A fallback card for handling bugs
+ fallback = ui.form_card(
+     box='fallback',
+     items=[ui.text('Uh-oh, something went wrong!')]
+ )
+
+
+ def asr(recording: bool = False, audio_path: str = None, transcription: str = '') -> ui.FormCard:
+     """
+     Card for Automatic Speech Recognition.
+     """
+
+     button_name = 'stop' if recording else 'start'
+     button_label = '⏹️ Stop Recording' if recording else '🎙️ Start Recording'
+     visible = audio_path is not None
+
+     card = ui.form_card(
+         box='main',
+         items=[
+             ui.separator(label='Microphone'),
+             ui.buttons(items=[ui.button(name=button_name, label=button_label, primary=True)], justify='center'),
+             ui.progress(label='Recording...', caption='', visible=recording),
+             ui.separator(label='Audio', visible=visible),
+             ui.text(
+                 content=f'''<center>
+                 <audio controls><source src="{audio_path}" type="audio/wav"></source></audio>
+                 </center>''',
+                 visible=visible
+             ),
+             ui.separator(label='Transcription', visible=visible),
+             ui.textbox(name='transcription', value=transcription, multiline=True, visible=visible)
+         ]
+     )
+
+     return card
+
+
+ def crash_report(q: Q) -> ui.FormCard:
+     """
+     Card for capturing the stack trace and current application state, for error reporting.
+     This function is called by the main serve() loop on uncaught exceptions.
+     """
+
+     def code_block(content): return '\n'.join(['```', *content, '```'])
+
+     type_, value_, traceback_ = sys.exc_info()
+     stack_trace = traceback.format_exception(type_, value_, traceback_)
+
+     dump = [
+         '### Stack Trace',
+         code_block(stack_trace),
+     ]
+
+     states = [
+         ('q.app', q.app),
+         ('q.user', q.user),
+         ('q.client', q.client),
+         ('q.events', q.events),
+         ('q.args', q.args)
+     ]
+     for name, source in states:
+         dump.append(f'### {name}')
+         dump.append(code_block([f'{k}: {v}' for k, v in expando_to_dict(source).items()]))
+
+     return ui.form_card(
+         box='main',
+         items=[
+             ui.stats(
+                 items=[
+                     ui.stat(
+                         label='',
+                         value='Oops!',
+                         caption='Something went wrong',
+                         icon='Error'
+                     )
+                 ],
+             ),
+             ui.separator(),
+             ui.text_l(content='Apologies for the inconvenience!'),
+             ui.buttons(items=[ui.button(name='reload', label='Reload', primary=True)]),
+             ui.expander(name='report', label='Error Details', items=[
+                 ui.text(
+                     f'To report this issue, <a href="{issue_url}" target="_blank">please open an issue</a> with the details below:'),
+                 ui.text_l(content=f'Report Issue in App: **{app_name}**'),
+                 ui.text(content='\n'.join(dump)),
+             ])
+         ]
+     )
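For orientation, `asr` is the only card that changes at runtime; the handlers in app.py simply rebuild it in one of three states. A small usage sketch (the audio path and transcription are illustrative values, not real outputs):

```python
import cards

idle = cards.asr()                       # idle: only the 'Start Recording' button
busy = cards.asr(recording=True)         # recording: 'Stop Recording' button plus progress bar
done = cards.asr(
    audio_path='/_f/example/sound.wav',  # illustrative Wave-server path of the uploaded WAV
    transcription='hello world',         # illustrative Whisper output
)                                        # done: audio player and transcription textbox become visible
```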
record.js ADDED
@@ -0,0 +1,130 @@
+ // Shim for Safari.
+ window.AudioContext = window.AudioContext || window.webkitAudioContext
+
+ function audioBufferToWav(buffer, opt) {
+   opt = opt || {}
+   var numChannels = buffer.numberOfChannels
+   var sampleRate = buffer.sampleRate
+   var format = opt.float32 ? 3 : 1
+   var bitDepth = format === 3 ? 32 : 16
+   var result
+   if (numChannels === 2) {
+     result = interleave(buffer.getChannelData(0), buffer.getChannelData(1))
+   } else {
+     result = buffer.getChannelData(0)
+   }
+   return encodeWAV(result, format, sampleRate, numChannels, bitDepth)
+ }
+
+ function encodeWAV(samples, format, sampleRate, numChannels, bitDepth) {
+   var bytesPerSample = bitDepth / 8
+   var blockAlign = numChannels * bytesPerSample
+   var buffer = new ArrayBuffer(44 + samples.length * bytesPerSample)
+   var view = new DataView(buffer)
+   /* RIFF identifier */
+   writeString(view, 0, 'RIFF')
+   /* RIFF chunk length */
+   view.setUint32(4, 36 + samples.length * bytesPerSample, true)
+   /* RIFF type */
+   writeString(view, 8, 'WAVE')
+   /* format chunk identifier */
+   writeString(view, 12, 'fmt ')
+   /* format chunk length */
+   view.setUint32(16, 16, true)
+   /* sample format (raw) */
+   view.setUint16(20, format, true)
+   /* channel count */
+   view.setUint16(22, numChannels, true)
+   /* sample rate */
+   view.setUint32(24, sampleRate, true)
+   /* byte rate (sample rate * block align) */
+   view.setUint32(28, sampleRate * blockAlign, true)
+   /* block align (channel count * bytes per sample) */
+   view.setUint16(32, blockAlign, true)
+   /* bits per sample */
+   view.setUint16(34, bitDepth, true)
+   /* data chunk identifier */
+   writeString(view, 36, 'data')
+   /* data chunk length */
+   view.setUint32(40, samples.length * bytesPerSample, true)
+   if (format === 1) { // Raw PCM
+     floatTo16BitPCM(view, 44, samples)
+   } else {
+     writeFloat32(view, 44, samples)
+   }
+   return buffer
+ }
+
+ function interleave(inputL, inputR) {
+   var length = inputL.length + inputR.length
+   var result = new Float32Array(length)
+   var index = 0
+   var inputIndex = 0
+   while (index < length) {
+     result[index++] = inputL[inputIndex]
+     result[index++] = inputR[inputIndex]
+     inputIndex++
+   }
+   return result
+ }
+
+ function writeFloat32(output, offset, input) {
+   for (var i = 0; i < input.length; i++, offset += 4) {
+     output.setFloat32(offset, input[i], true)
+   }
+ }
+
+ function floatTo16BitPCM(output, offset, input) {
+   for (var i = 0; i < input.length; i++, offset += 2) {
+     var s = Math.max(-1, Math.min(1, input[i]))
+     output.setInt16(offset, s < 0 ? s * 0x8000 : s * 0x7FFF, true)
+   }
+ }
+
+ function writeString(view, offset, string) {
+   for (var i = 0; i < string.length; i++) {
+     view.setUint8(offset + i, string.charCodeAt(i))
+   }
+ }
+
+ // Safari does not support promise-based decodeAudioData, need to use callback instead.
+ const decodeAudioData = buffer => new Promise((res, rej) => {
+   new AudioContext().decodeAudioData(buffer, res, rej)
+ })
+ const startRecording = async () => {
+   const data = []
+   // Ask for mic permissions.
+   const stream = await navigator.mediaDevices.getUserMedia({ video: false, audio: true })
+   window.stream = stream
+   // Use polyfill for older browsers.
+   if (!window.MediaRecorder) {
+     window.MediaRecorder = OpusMediaRecorder
+     window.recorder = new MediaRecorder(stream, {}, {
+       OggOpusEncoderWasmPath: 'https://cdn.jsdelivr.net/npm/opus-media-recorder@latest/OggOpusEncoder.wasm',
+       WebMOpusEncoderWasmPath: 'https://cdn.jsdelivr.net/npm/opus-media-recorder@latest/WebMOpusEncoder.wasm'
+     })
+   }
+   else window.recorder = new MediaRecorder(stream)
+   // Handle incoming data.
+   window.recorder.ondataavailable = e => data.push(e.data)
+   window.recorder.start()
+   window.recorder.onerror = e => { throw e.error || new Error(e.name) }
+   window.recorder.onstop = async (e) => {
+     const blob = new Blob(data)
+     const fetchedBlob = await fetch(URL.createObjectURL(blob))
+     const arrayBuffer = await fetchedBlob.arrayBuffer()
+     // Convert to wav format.
+     const wav = audioBufferToWav(await decodeAudioData(arrayBuffer))
+     const formData = new FormData()
+     formData.append('files', new Blob([wav], { type: 'audio/wave' }), 'sound.wav')
+     // Send the audio file to Wave server.
+     const res = await fetch(wave.uploadURL, { method: 'POST', body: formData })
+     const { files } = await res.json()
+     // Emit event (q.events.audio.captured) with a URL of the audio file at Wave server.
+     window.wave.emit('audio', 'captured', files[0])
+   }
+ }
+ const stopRecording = () => {
+   window.recorder.stop()
+   window.stream.getTracks().forEach(track => track.stop())
+ }
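The final `window.wave.emit('audio', 'captured', files[0])` call is the contract between this script and the Python app: an event emitted as `emit(source, name, value)` arrives on the next query as `q.events.<source>.<name>`. A sketch of the receiving side, only restating what `audio_inference` in app.py already does (the handler name here is arbitrary):

```python
from h2o_wave import Q, on


@on('audio')
async def on_audio_captured(q: Q):
    # The value passed to emit() in record.js: the Wave-server path of the uploaded sound.wav.
    wave_path = q.events.audio.captured
    # Copy the file from the Wave server to the app's working directory before using it.
    local_path = await q.site.download(wave_path, '.')
```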
requirements.txt ADDED
@@ -0,0 +1,2 @@
+ h2o_wave==0.23.1
+ git+https://github.com/openai/whisper.git
utils.py ADDED
@@ -0,0 +1,9 @@
+ from h2o_wave import ui
+
+
+ def get_inline_script(text: str) -> ui.InlineScript:
+     """
+     Get Wave's Inline Script.
+     """
+
+     return ui.inline_script(text)