GRATITUD3: reach-vb (HF staff) committed
Commit 150b49f (0 parents)

Duplicate from facebook/seamless_m4t


Co-authored-by: Vaibhav Srivastav <reach-vb@users.noreply.huggingface.co>

Files changed (9)
  1. .gitattributes +36 -0
  2. Dockerfile +64 -0
  3. README.md +12 -0
  4. app.py +412 -0
  5. assets/sample_input.mp3 +3 -0
  6. assets/sample_input_2.mp3 +3 -0
  7. lang_list.py +254 -0
  8. requirements.txt +4 -0
  9. style.css +16 -0
.gitattributes ADDED
@@ -0,0 +1,36 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.mp3 filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,64 @@
+ FROM nvidia/cuda:11.7.1-cudnn8-devel-ubuntu22.04
+ ENV DEBIAN_FRONTEND=noninteractive
+ RUN apt-get update && \
+     apt-get upgrade -y && \
+     apt-get install -y --no-install-recommends \
+         git \
+         git-lfs \
+         wget \
+         curl \
+         # python build dependencies \
+         build-essential \
+         libssl-dev \
+         zlib1g-dev \
+         libbz2-dev \
+         libreadline-dev \
+         libsqlite3-dev \
+         libncursesw5-dev \
+         xz-utils \
+         tk-dev \
+         libxml2-dev \
+         libxmlsec1-dev \
+         libffi-dev \
+         liblzma-dev \
+         # gradio dependencies \
+         ffmpeg \
+         # fairseq2 dependencies \
+         libsndfile-dev && \
+     apt-get clean && \
+     rm -rf /var/lib/apt/lists/*
+
+ RUN useradd -m -u 1000 user
+ USER user
+ ENV HOME=/home/user \
+     PATH=/home/user/.local/bin:${PATH}
+ WORKDIR ${HOME}/app
+
+ RUN curl https://pyenv.run | bash
+ ENV PATH=${HOME}/.pyenv/shims:${HOME}/.pyenv/bin:${PATH}
+ ARG PYTHON_VERSION=3.10.12
+ RUN pyenv install ${PYTHON_VERSION} && \
+     pyenv global ${PYTHON_VERSION} && \
+     pyenv rehash && \
+     pip install --no-cache-dir -U pip setuptools wheel
+
+ RUN pip install --no-cache-dir torch==2.0.1 gradio==3.40.1 && \
+     pip install --extra-index-url https://test.pypi.org/simple/ fairseq2==0.1.0rc0
+ RUN git clone https://github.com/facebookresearch/seamless_communication && \
+     cd seamless_communication && \
+     pip install . && \
+     cd .. && \
+     rm -rf seamless_communication
+
+ COPY ./requirements.txt /tmp/requirements.txt
+ RUN pip install --no-cache-dir --upgrade -r /tmp/requirements.txt
+
+ COPY --chown=1000 . ${HOME}/app
+ ENV PYTHONPATH=${HOME}/app \
+     PYTHONUNBUFFERED=1 \
+     GRADIO_ALLOW_FLAGGING=never \
+     GRADIO_NUM_PORTS=1 \
+     GRADIO_SERVER_NAME=0.0.0.0 \
+     GRADIO_THEME=huggingface \
+     SYSTEM=spaces
+ CMD ["python", "app.py"]
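
A quick way to verify the resulting image is a short smoke test that imports the pinned stack. This is a hypothetical sketch, not part of the commit, and assumes the versions pinned above and in requirements.txt:

    # smoke_test.py: hypothetical check of the image's Python stack
    import gradio
    import torch
    import torchaudio

    assert torch.__version__.startswith("2.0.1")       # pinned in the Dockerfile
    assert torchaudio.__version__.startswith("2.0.2")  # pinned in requirements.txt
    assert gradio.__version__ == "3.40.1"              # pinned in both
    print("CUDA available:", torch.cuda.is_available())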
README.md ADDED
@@ -0,0 +1,12 @@
+ ---
+ title: Seamless M4T
+ emoji: 📞
+ colorFrom: blue
+ colorTo: yellow
+ sdk: docker
+ pinned: false
+ suggested_hardware: t4-medium
+ duplicated_from: facebook/seamless_m4t
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,412 @@
+ import os
+
+ from typing import Union
+ import gradio as gr
+ import numpy as np
+ import torch
+ import torchaudio
+ from seamless_communication.models.inference.translator import Translator
+
+ from lang_list import (
+     LANGUAGE_NAME_TO_CODE,
+     S2ST_TARGET_LANGUAGE_NAMES,
+     S2TT_TARGET_LANGUAGE_NAMES,
+     T2TT_TARGET_LANGUAGE_NAMES,
+     TEXT_SOURCE_LANGUAGE_NAMES,
+ )
+
+ DESCRIPTION = """# SeamlessM4T
+
+ [SeamlessM4T](https://github.com/facebookresearch/seamless_communication) is designed to provide high-quality
+ translation, allowing people from different linguistic communities to communicate effortlessly through speech and text.
+
+ This unified model enables multiple tasks, such as Speech-to-Speech (S2ST), Speech-to-Text (S2TT), Text-to-Speech (T2ST)
+ and Text-to-Text (T2TT) translation, as well as Automatic Speech Recognition (ASR), without relying on separate models.
+ """
+
+ CACHE_EXAMPLES = os.getenv("CACHE_EXAMPLES") == "1"
+
+ TASK_NAMES = [
+     "S2ST (Speech to Speech translation)",
+     "S2TT (Speech to Text translation)",
+     "T2ST (Text to Speech translation)",
+     "T2TT (Text to Text translation)",
+     "ASR (Automatic Speech Recognition)",
+ ]
+ AUDIO_SAMPLE_RATE = 16000.0
+ MAX_INPUT_AUDIO_LENGTH = 60  # in seconds
+ DEFAULT_TARGET_LANGUAGE = "French"
+
+ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+ translator = Translator(
+     model_name_or_card="seamlessM4T_large",
+     vocoder_name_or_card="vocoder_36langs",
+     device=device,
+     sample_rate=AUDIO_SAMPLE_RATE,
+ )
+
+
+ def predict(
+     task_name: str,
+     audio_source: str,
+     input_audio_mic: Union[str, None],
+     input_audio_file: Union[str, None],
+     input_text: Union[str, None],
+     source_language: Union[str, None],
+     target_language: str,
+ ) -> tuple[Union[tuple[int, np.ndarray], None], str]:
+     task_name = task_name.split()[0]
+     source_language_code = LANGUAGE_NAME_TO_CODE.get(source_language, None)
+     target_language_code = LANGUAGE_NAME_TO_CODE[target_language]
+
+     if task_name in ["S2ST", "S2TT", "ASR"]:
+         if audio_source == "microphone":
+             input_data = input_audio_mic
+         else:
+             input_data = input_audio_file
+
+         arr, org_sr = torchaudio.load(input_data)
+         new_arr = torchaudio.functional.resample(arr, orig_freq=org_sr, new_freq=AUDIO_SAMPLE_RATE)
+         max_length = int(MAX_INPUT_AUDIO_LENGTH * AUDIO_SAMPLE_RATE)
+         if new_arr.shape[1] > max_length:
+             new_arr = new_arr[:, :max_length]
+             gr.Warning(f"Input audio is too long. Only the first {MAX_INPUT_AUDIO_LENGTH} seconds are used.")
+         torchaudio.save(input_data, new_arr, sample_rate=int(AUDIO_SAMPLE_RATE))
+     else:
+         input_data = input_text
+     text_out, wav, sr = translator.predict(
+         input=input_data,
+         task_str=task_name,
+         tgt_lang=target_language_code,
+         src_lang=source_language_code,
+     )
+     if task_name in ["S2ST", "T2ST"]:
+         return (sr, wav.cpu().detach().numpy()), text_out
+     else:
+         return None, text_out
+
+
+ def process_s2st_example(input_audio_file: str, target_language: str) -> tuple[str, str]:
+     return predict(
+         task_name="S2ST",
+         audio_source="file",
+         input_audio_mic=None,
+         input_audio_file=input_audio_file,
+         input_text=None,
+         source_language=None,
+         target_language=target_language,
+     )
+
+
+ def process_s2tt_example(input_audio_file: str, target_language: str) -> tuple[str, str]:
+     return predict(
+         task_name="S2TT",
+         audio_source="file",
+         input_audio_mic=None,
+         input_audio_file=input_audio_file,
+         input_text=None,
+         source_language=None,
+         target_language=target_language,
+     )
+
+
+ def process_t2st_example(input_text: str, source_language: str, target_language: str) -> tuple[str, str]:
+     return predict(
+         task_name="T2ST",
+         audio_source="",
+         input_audio_mic=None,
+         input_audio_file=None,
+         input_text=input_text,
+         source_language=source_language,
+         target_language=target_language,
+     )
+
+
+ def process_t2tt_example(input_text: str, source_language: str, target_language: str) -> tuple[str, str]:
+     return predict(
+         task_name="T2TT",
+         audio_source="",
+         input_audio_mic=None,
+         input_audio_file=None,
+         input_text=input_text,
+         source_language=source_language,
+         target_language=target_language,
+     )
+
+
+ def process_asr_example(input_audio_file: str, target_language: str) -> tuple[str, str]:
+     return predict(
+         task_name="ASR",
+         audio_source="file",
+         input_audio_mic=None,
+         input_audio_file=input_audio_file,
+         input_text=None,
+         source_language=None,
+         target_language=target_language,
+     )
+
+
+ def update_audio_ui(audio_source: str) -> tuple[dict, dict]:
+     mic = audio_source == "microphone"
+     return (
+         gr.update(visible=mic, value=None),  # input_audio_mic
+         gr.update(visible=not mic, value=None),  # input_audio_file
+     )
+
+
+ def update_input_ui(task_name: str) -> tuple[dict, dict, dict, dict]:
+     task_name = task_name.split()[0]
+     if task_name == "S2ST":
+         return (
+             gr.update(visible=True),  # audio_box
+             gr.update(visible=False),  # input_text
+             gr.update(visible=False),  # source_language
+             gr.update(
+                 visible=True, choices=S2ST_TARGET_LANGUAGE_NAMES, value=DEFAULT_TARGET_LANGUAGE
+             ),  # target_language
+         )
+     elif task_name == "S2TT":
+         return (
+             gr.update(visible=True),  # audio_box
+             gr.update(visible=False),  # input_text
+             gr.update(visible=False),  # source_language
+             gr.update(
+                 visible=True, choices=S2TT_TARGET_LANGUAGE_NAMES, value=DEFAULT_TARGET_LANGUAGE
+             ),  # target_language
+         )
+     elif task_name == "T2ST":
+         return (
+             gr.update(visible=False),  # audio_box
+             gr.update(visible=True),  # input_text
+             gr.update(visible=True),  # source_language
+             gr.update(
+                 visible=True, choices=S2ST_TARGET_LANGUAGE_NAMES, value=DEFAULT_TARGET_LANGUAGE
+             ),  # target_language
+         )
+     elif task_name == "T2TT":
+         return (
+             gr.update(visible=False),  # audio_box
+             gr.update(visible=True),  # input_text
+             gr.update(visible=True),  # source_language
+             gr.update(
+                 visible=True, choices=T2TT_TARGET_LANGUAGE_NAMES, value=DEFAULT_TARGET_LANGUAGE
+             ),  # target_language
+         )
+     elif task_name == "ASR":
+         return (
+             gr.update(visible=True),  # audio_box
+             gr.update(visible=False),  # input_text
+             gr.update(visible=False),  # source_language
+             gr.update(
+                 visible=True, choices=S2TT_TARGET_LANGUAGE_NAMES, value=DEFAULT_TARGET_LANGUAGE
+             ),  # target_language
+         )
+     else:
+         raise ValueError(f"Unknown task: {task_name}")
+
+
+ def update_output_ui(task_name: str) -> tuple[dict, dict]:
+     task_name = task_name.split()[0]
+     if task_name in ["S2ST", "T2ST"]:
+         return (
+             gr.update(visible=True, value=None),  # output_audio
+             gr.update(value=None),  # output_text
+         )
+     elif task_name in ["S2TT", "T2TT", "ASR"]:
+         return (
+             gr.update(visible=False, value=None),  # output_audio
+             gr.update(value=None),  # output_text
+         )
+     else:
+         raise ValueError(f"Unknown task: {task_name}")
+
+
+ def update_example_ui(task_name: str) -> tuple[dict, dict, dict, dict, dict]:
+     task_name = task_name.split()[0]
+     return (
+         gr.update(visible=task_name == "S2ST"),  # s2st_example_row
+         gr.update(visible=task_name == "S2TT"),  # s2tt_example_row
+         gr.update(visible=task_name == "T2ST"),  # t2st_example_row
+         gr.update(visible=task_name == "T2TT"),  # t2tt_example_row
+         gr.update(visible=task_name == "ASR"),  # asr_example_row
+     )
+
+
+ with gr.Blocks(css="style.css") as demo:
+     gr.Markdown(DESCRIPTION)
+     gr.DuplicateButton(
+         value="Duplicate Space for private use",
+         elem_id="duplicate-button",
+         visible=os.getenv("SHOW_DUPLICATE_BUTTON") == "1",
+     )
+     with gr.Group():
+         task_name = gr.Dropdown(
+             label="Task",
+             choices=TASK_NAMES,
+             value=TASK_NAMES[0],
+         )
+         with gr.Row():
+             source_language = gr.Dropdown(
+                 label="Source language",
+                 choices=TEXT_SOURCE_LANGUAGE_NAMES,
+                 value="English",
+                 visible=False,
+             )
+             target_language = gr.Dropdown(
+                 label="Target language",
+                 choices=S2ST_TARGET_LANGUAGE_NAMES,
+                 value=DEFAULT_TARGET_LANGUAGE,
+             )
+         with gr.Row() as audio_box:
+             audio_source = gr.Radio(
+                 label="Audio source",
+                 choices=["file", "microphone"],
+                 value="file",
+             )
+             input_audio_mic = gr.Audio(
+                 label="Input speech",
+                 type="filepath",
+                 source="microphone",
+                 visible=False,
+             )
+             input_audio_file = gr.Audio(
+                 label="Input speech",
+                 type="filepath",
+                 source="upload",
+                 visible=True,
+             )
+         input_text = gr.Textbox(label="Input text", visible=False)
+         btn = gr.Button("Translate")
+         with gr.Column():
+             output_audio = gr.Audio(
+                 label="Translated speech",
+                 autoplay=False,
+                 streaming=False,
+                 type="numpy",
+             )
+             output_text = gr.Textbox(label="Translated text")
+
+     with gr.Row(visible=True) as s2st_example_row:
+         s2st_examples = gr.Examples(
+             examples=[
+                 ["assets/sample_input.mp3", "French"],
+                 ["assets/sample_input.mp3", "Mandarin Chinese"],
+                 ["assets/sample_input_2.mp3", "Hindi"],
+                 ["assets/sample_input_2.mp3", "Spanish"],
+             ],
+             inputs=[input_audio_file, target_language],
+             outputs=[output_audio, output_text],
+             fn=process_s2st_example,
+             cache_examples=CACHE_EXAMPLES,
+         )
+     with gr.Row(visible=False) as s2tt_example_row:
+         s2tt_examples = gr.Examples(
+             examples=[
+                 ["assets/sample_input.mp3", "French"],
+                 ["assets/sample_input.mp3", "Mandarin Chinese"],
+                 ["assets/sample_input_2.mp3", "Hindi"],
+                 ["assets/sample_input_2.mp3", "Spanish"],
+             ],
+             inputs=[input_audio_file, target_language],
+             outputs=[output_audio, output_text],
+             fn=process_s2tt_example,
+             cache_examples=CACHE_EXAMPLES,
+         )
+     with gr.Row(visible=False) as t2st_example_row:
+         t2st_examples = gr.Examples(
+             examples=[
+                 ["My favorite animal is the elephant.", "English", "French"],
+                 ["My favorite animal is the elephant.", "English", "Mandarin Chinese"],
+                 ["Meta AI's Seamless M4T model is democratising spoken communication across language barriers",
+                  "English", "Hindi"],
+                 ["Meta AI's Seamless M4T model is democratising spoken communication across language barriers",
+                  "English", "Spanish"],
+             ],
+             inputs=[input_text, source_language, target_language],
+             outputs=[output_audio, output_text],
+             fn=process_t2st_example,
+             cache_examples=CACHE_EXAMPLES,
+         )
+     with gr.Row(visible=False) as t2tt_example_row:
+         t2tt_examples = gr.Examples(
+             examples=[
+                 ["My favorite animal is the elephant.", "English", "French"],
+                 ["My favorite animal is the elephant.", "English", "Mandarin Chinese"],
+                 ["Meta AI's Seamless M4T model is democratising spoken communication across language barriers",
+                  "English", "Hindi"],
+                 ["Meta AI's Seamless M4T model is democratising spoken communication across language barriers",
+                  "English", "Spanish"],
+             ],
+             inputs=[input_text, source_language, target_language],
+             outputs=[output_audio, output_text],
+             fn=process_t2tt_example,
+             cache_examples=CACHE_EXAMPLES,
+         )
+     with gr.Row(visible=False) as asr_example_row:
+         asr_examples = gr.Examples(
+             examples=[
+                 ["assets/sample_input.mp3", "English"],
+                 ["assets/sample_input_2.mp3", "English"],
+             ],
+             inputs=[input_audio_file, target_language],
+             outputs=[output_audio, output_text],
+             fn=process_asr_example,
+             cache_examples=CACHE_EXAMPLES,
+         )
+
+     audio_source.change(
+         fn=update_audio_ui,
+         inputs=audio_source,
+         outputs=[
+             input_audio_mic,
+             input_audio_file,
+         ],
+         queue=False,
+         api_name=False,
+     )
+     task_name.change(
+         fn=update_input_ui,
+         inputs=task_name,
+         outputs=[
+             audio_box,
+             input_text,
+             source_language,
+             target_language,
+         ],
+         queue=False,
+         api_name=False,
+     ).then(
+         fn=update_output_ui,
+         inputs=task_name,
+         outputs=[output_audio, output_text],
+         queue=False,
+         api_name=False,
+     ).then(
+         fn=update_example_ui,
+         inputs=task_name,
+         outputs=[
+             s2st_example_row,
+             s2tt_example_row,
+             t2st_example_row,
+             t2tt_example_row,
+             asr_example_row,
+         ],
+         queue=False,
+         api_name=False,
+     )
+
+     btn.click(
+         fn=predict,
+         inputs=[
+             task_name,
+             audio_source,
+             input_audio_mic,
+             input_audio_file,
+             input_text,
+             source_language,
+             target_language,
+         ],
+         outputs=[output_audio, output_text],
+         api_name="run",
+     )
+ demo.queue(max_size=50).launch()
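
The Gradio UI above is a thin wrapper: every task funnels into translator.predict, which returns the translated text plus, for speech-output tasks, a waveform and its sample rate. A minimal sketch of calling the Translator directly for text-to-text translation, mirroring the constructor and predict arguments used in app.py (hypothetical standalone script, not part of this commit; weights download on first use):

    import torch
    from seamless_communication.models.inference.translator import Translator

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    translator = Translator(
        model_name_or_card="seamlessM4T_large",
        vocoder_name_or_card="vocoder_36langs",
        device=device,
        sample_rate=16000,
    )

    # T2TT (English -> French); wav and sr only matter for S2ST/T2ST.
    text_out, wav, sr = translator.predict(
        input="My favorite animal is the elephant.",
        task_str="T2TT",
        tgt_lang="fra",
        src_lang="eng",
    )
    print(text_out)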
assets/sample_input.mp3 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:982369687f05bf8fcd6923c4ffcccda0fcce92f44eceae5a9d00a431f07ea87b
+ size 10272
assets/sample_input_2.mp3 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6a505a4641e3f5f0ddec9508832793aa20e63d2545530b66bc04a9bd19a742e6
+ size 30624
lang_list.py ADDED
@@ -0,0 +1,254 @@
+ # Language dict
+ language_code_to_name = {
+     "afr": "Afrikaans",
+     "amh": "Amharic",
+     "arb": "Modern Standard Arabic",
+     "ary": "Moroccan Arabic",
+     "arz": "Egyptian Arabic",
+     "asm": "Assamese",
+     "ast": "Asturian",
+     "azj": "North Azerbaijani",
+     "bel": "Belarusian",
+     "ben": "Bengali",
+     "bos": "Bosnian",
+     "bul": "Bulgarian",
+     "cat": "Catalan",
+     "ceb": "Cebuano",
+     "ces": "Czech",
+     "ckb": "Central Kurdish",
+     "cmn": "Mandarin Chinese",
+     "cym": "Welsh",
+     "dan": "Danish",
+     "deu": "German",
+     "ell": "Greek",
+     "eng": "English",
+     "est": "Estonian",
+     "eus": "Basque",
+     "fin": "Finnish",
+     "fra": "French",
+     "gaz": "West Central Oromo",
+     "gle": "Irish",
+     "glg": "Galician",
+     "guj": "Gujarati",
+     "heb": "Hebrew",
+     "hin": "Hindi",
+     "hrv": "Croatian",
+     "hun": "Hungarian",
+     "hye": "Armenian",
+     "ibo": "Igbo",
+     "ind": "Indonesian",
+     "isl": "Icelandic",
+     "ita": "Italian",
+     "jav": "Javanese",
+     "jpn": "Japanese",
+     "kam": "Kamba",
+     "kan": "Kannada",
+     "kat": "Georgian",
+     "kaz": "Kazakh",
+     "kea": "Kabuverdianu",
+     "khk": "Halh Mongolian",
+     "khm": "Khmer",
+     "kir": "Kyrgyz",
+     "kor": "Korean",
+     "lao": "Lao",
+     "lit": "Lithuanian",
+     "ltz": "Luxembourgish",
+     "lug": "Ganda",
+     "luo": "Luo",
+     "lvs": "Standard Latvian",
+     "mai": "Maithili",
+     "mal": "Malayalam",
+     "mar": "Marathi",
+     "mkd": "Macedonian",
+     "mlt": "Maltese",
+     "mni": "Meitei",
+     "mya": "Burmese",
+     "nld": "Dutch",
+     "nno": "Norwegian Nynorsk",
+     "nob": "Norwegian Bokm\u00e5l",
+     "npi": "Nepali",
+     "nya": "Nyanja",
+     "oci": "Occitan",
+     "ory": "Odia",
+     "pan": "Punjabi",
+     "pbt": "Southern Pashto",
+     "pes": "Western Persian",
+     "pol": "Polish",
+     "por": "Portuguese",
+     "ron": "Romanian",
+     "rus": "Russian",
+     "slk": "Slovak",
+     "slv": "Slovenian",
+     "sna": "Shona",
+     "snd": "Sindhi",
+     "som": "Somali",
+     "spa": "Spanish",
+     "srp": "Serbian",
+     "swe": "Swedish",
+     "swh": "Swahili",
+     "tam": "Tamil",
+     "tel": "Telugu",
+     "tgk": "Tajik",
+     "tgl": "Tagalog",
+     "tha": "Thai",
+     "tur": "Turkish",
+     "ukr": "Ukrainian",
+     "urd": "Urdu",
+     "uzn": "Northern Uzbek",
+     "vie": "Vietnamese",
+     "xho": "Xhosa",
+     "yor": "Yoruba",
+     "yue": "Cantonese",
+     "zlm": "Colloquial Malay",
+     "zsm": "Standard Malay",
+     "zul": "Zulu",
+ }
+ LANGUAGE_NAME_TO_CODE = {v: k for k, v in language_code_to_name.items()}
+
+ # Source langs: S2ST / S2TT / ASR don't need source lang
+ # T2TT / T2ST use this
+ text_source_language_codes = [
+     "afr",
+     "amh",
+     "arb",
+     "ary",
+     "arz",
+     "asm",
+     "azj",
+     "bel",
+     "ben",
+     "bos",
+     "bul",
+     "cat",
+     "ceb",
+     "ces",
+     "ckb",
+     "cmn",
+     "cym",
+     "dan",
+     "deu",
+     "ell",
+     "eng",
+     "est",
+     "eus",
+     "fin",
+     "fra",
+     "gaz",
+     "gle",
+     "glg",
+     "guj",
+     "heb",
+     "hin",
+     "hrv",
+     "hun",
+     "hye",
+     "ibo",
+     "ind",
+     "isl",
+     "ita",
+     "jav",
+     "jpn",
+     "kan",
+     "kat",
+     "kaz",
+     "khk",
+     "khm",
+     "kir",
+     "kor",
+     "lao",
+     "lit",
+     "lug",
+     "luo",
+     "lvs",
+     "mai",
+     "mal",
+     "mar",
+     "mkd",
+     "mlt",
+     "mni",
+     "mya",
+     "nld",
+     "nno",
+     "nob",
+     "npi",
+     "nya",
+     "ory",
+     "pan",
+     "pbt",
+     "pes",
+     "pol",
+     "por",
+     "ron",
+     "rus",
+     "slk",
+     "slv",
+     "sna",
+     "snd",
+     "som",
+     "spa",
+     "srp",
+     "swe",
+     "swh",
+     "tam",
+     "tel",
+     "tgk",
+     "tgl",
+     "tha",
+     "tur",
+     "ukr",
+     "urd",
+     "uzn",
+     "vie",
+     "yor",
+     "yue",
+     "zsm",
+     "zul",
+ ]
+ TEXT_SOURCE_LANGUAGE_NAMES = sorted([language_code_to_name[code] for code in text_source_language_codes])
+
+ # Target langs:
+ # S2ST / T2ST
+ s2st_target_language_codes = [
+     "eng",
+     "arb",
+     "ben",
+     "cat",
+     "ces",
+     "cmn",
+     "cym",
+     "dan",
+     "deu",
+     "est",
+     "fin",
+     "fra",
+     "hin",
+     "ind",
+     "ita",
+     "jpn",
+     "kor",
+     "mlt",
+     "nld",
+     "pes",
+     "pol",
+     "por",
+     "ron",
+     "rus",
+     "slk",
+     "spa",
+     "swe",
+     "swh",
+     "tel",
+     "tgl",
+     "tha",
+     "tur",
+     "ukr",
+     "urd",
+     "uzn",
+     "vie",
+ ]
+ S2ST_TARGET_LANGUAGE_NAMES = sorted([language_code_to_name[code] for code in s2st_target_language_codes])
+
+ # S2TT / ASR
+ S2TT_TARGET_LANGUAGE_NAMES = TEXT_SOURCE_LANGUAGE_NAMES
+ # T2TT
+ T2TT_TARGET_LANGUAGE_NAMES = TEXT_SOURCE_LANGUAGE_NAMES
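
The UI deals in display names while translator.predict expects the three-letter codes above, so LANGUAGE_NAME_TO_CODE is simply the inverted dict. A small illustrative check (hypothetical snippet, not part of this commit):

    from lang_list import LANGUAGE_NAME_TO_CODE, S2ST_TARGET_LANGUAGE_NAMES

    assert LANGUAGE_NAME_TO_CODE["French"] == "fra"
    assert LANGUAGE_NAME_TO_CODE["Mandarin Chinese"] == "cmn"
    # 36 speech-output languages, matching the vocoder_36langs card in app.py
    assert len(S2ST_TARGET_LANGUAGE_NAMES) == 36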
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ gradio==3.40.1
+ huggingface_hub==0.16.4
+ torch==2.0.1
+ torchaudio==2.0.2
style.css ADDED
@@ -0,0 +1,16 @@
+ h1 {
+     text-align: center;
+ }
+
+ #duplicate-button {
+     margin: auto;
+     color: #fff;
+     background: #1565c0;
+     border-radius: 100vh;
+ }
+
+ #component-0 {
+     max-width: 730px;
+     margin: auto;
+     padding-top: 1.5rem;
+ }