# Gradio app that renders the Malaysian Speech-to-Text Leaderboard.

import gradio as gr
import pandas as pd
from css_html_js import custom_css

# Page heading, emitted as raw HTML. The emoji is the Malaysian flag
# (regional indicators M + Y); it must be stored as real UTF-8, not as the
# legacy-code-page decoding of its bytes.
TITLE = """<h1 align="center" id="space-title">🇲🇾 Malaysian Speech-to-Text Leaderboard</h1>"""

# Markdown shown under the title: what the leaderboard tracks, the three
# evaluation datasets, and the normalisation applied to the test sets.
# NOTE(review): the emoji in this text had been stored mis-encoded. The flag
# and chart emoji are restored exactly; the leading emoji had lost its final
# byte, so the triangular ruler below is a presumed restoration — confirm.
INTRODUCTION_TEXT = """
📐 The 🇲🇾 Malaysian Speech-to-Text Leaderboard aims to track, rank and evaluate Malaysian Speech-to-Text models. All notebooks at https://github.com/mesolitica/malaysian-stt-benchmarks

## Dataset

📈 We evaluate models based on 3 datasets,

1. Malaya-Speech test set, Malay language, https://huggingface.co/datasets/mesolitica/speech-test-set/tree/main/malaya-speech
2. Fleurs MS-MY test set, Malay language, https://huggingface.co/datasets/mesolitica/speech-test-set/tree/main/fleurs-ms-my
3. IMDA TTS first 700 audio files, English language but with Manglish slang, https://huggingface.co/datasets/mesolitica/IMDA-TTS

## Heavy postprocess test set

1. We filtered test set that contain numbers because malaya-speech transducer trained on normalized numbers.
2. We lower case because malaya-speech transducer trained on lower case.
3. We removed punctuation because malaya-speech transducer trained without punctuation.
"""

def _result_row(model, size_mb, *, malaya, fleurs, imda=None):
    """Build one leaderboard row as a dict keyed by column label.

    Args:
        model: Model (or service) name shown in the first column.
        size_mb: FP16 model size in MB, or None when it is not known.
        malaya: (CER, WER) pair for the Malaya-Speech test set.
        fleurs: (CER, WER) pair for the Fleurs MS-MY test set.
        imda: Optional (CER, WER) pair for IMDA TTS. When omitted, the two
            IMDA keys are left out of the row entirely, so those cells are
            missing (NaN) once the rows are tabulated.

    Returns:
        A dict whose key order defines the column order of the table.
    """
    row = {
        'model': model,
        'model size FP16 (MB)': size_mb,
        'Malaya-Speech test CER': malaya[0],
        'Malaya-Speech test WER': malaya[1],
        # Label follows the dataset name used in the intro text and the
        # dataset URL (fleurs-ms-my); it was previously spelled "MY-MS".
        'Fleurs MS-MY CER': fleurs[0],
        'Fleurs MS-MY WER': fleurs[1],
    }
    if imda is not None:
        row['IMDA TTS CER'], row['IMDA TTS WER'] = imda
    return row


# Benchmark results, one row per model. Every metric pair is (CER, WER).
open_source = [
    _result_row(
        'goodtape.io', None,
        malaya=(0.09504487340205486, 0.1691902868373457),
        fleurs=(0.03643102801583697, 0.08672758155453257),
    ),
    _result_row(
        'openai/whisper-large-v3', 3090,
        malaya=(0.0349251317825172, 0.1032828282828283),
        fleurs=(0.026055551396846878, 0.07652049926522007),
        imda=(0.016648493852990828, 0.0386282289139432),
    ),
    _result_row(
        'openai/whisper-medium', 1530,
        malaya=(0.05064920144820262, 0.17534205321090568),
        fleurs=(0.04366882208520179, 0.13546055192128273),
        imda=(0.02065587879424904, 0.047277690563404855),
    ),
    _result_row(
        'openai/whisper-small', 483.5,
        malaya=(0.07485209857268262, 0.25748516055893106),
        fleurs=(0.06781078047622793, 0.21953142859857497),
        imda=(0.024812471688517194, 0.058901277294134434),
    ),
    _result_row(
        'openai/whisper-base', 145,
        malaya=(0.3574879236610538, 0.8303456599563157),
        fleurs=(0.1319124653794061, 0.40499286081235003),
        imda=(0.03914533450681607, 0.08951682444539587),
    ),
    _result_row(
        'openai/whisper-tiny', 75.5,
        malaya=(0.26941094281472105, 0.7414099751189915),
        fleurs=(0.38749733168917505, 0.812253445128297),
        imda=(0.048805770734828904, 0.11150629529200957),
    ),
    _result_row(
        'mesolitica/malaysian-whisper-medium', 1530,
        malaya=(0.05622483776367814, 0.14406629724252673),
        fleurs=(0.025543266604368554, 0.07940219915492629),
        imda=(0.01971214262944062, 0.047223078508792794),
    ),
    _result_row(
        'mesolitica/malaysian-whisper-small', 483.5,
        malaya=(0.049162419174983304, 0.15926901346983313),
        fleurs=(0.035517572531147, 0.10938718963023729),
        imda=(0.024228721439634855, 0.05546294182008469),
    ),
    _result_row(
        'mesolitica/malaysian-whisper-base', 145,
        malaya=(0.07242006488452603, 0.22081683495617924),
        fleurs=(0.06639564802362424, 0.19675812232021192),
        imda=(0.03982418421412676, 0.08917690642690643),
    ),
    _result_row(
        'mesolitica/malaysian-whisper-tiny', 75.5,
        malaya=(0.09423990117534763, 0.295029492365558),
        fleurs=(0.13390519685940314, 0.3461808122686204),
        imda=(0.07957313474501154, 0.1421708648494363),
    ),
    _result_row(
        'mesolitica/conformer-large-malay-whisper', 206.5,
        malaya=(0.025933167255719317, 0.0912131356803488),
        fleurs=(0.02548791948171514, 0.08376713097429746),
    ),
    _result_row(
        'mesolitica/conformer-medium-malay-whisper', 121.5,
        malaya=(0.024955598713609053, 0.09315638444736804),
        fleurs=(0.029205645523910067, 0.09253131557833799),
    ),
    _result_row(
        'mesolitica/conformer-medium-mixed', 121.5,
        malaya=(0.034618711056551774, 0.11179440626161938),
        fleurs=(0.032894184549728075, 0.1026977414887425),
    ),
    _result_row(
        'mesolitica/conformer-tiny-ctc + mesolitica/kenlm-pseudolabel-whisper-large-v3', 7.9,
        malaya=(0.0612581761581601, 0.21302693966628394),
        fleurs=(0.07573301800412188, 0.2527434609577528),
    ),
    _result_row(
        'mesolitica/conformer-12M-ctc + mesolitica/kenlm-pseudolabel-whisper-large-v3', 24.2,
        malaya=(0.06941749946814912, 0.22261096523391607),
        fleurs=(0.07657934690019219, 0.263075623142674),
    ),
]

# Tabulate the result rows; each distinct dict key becomes a column.
data = pd.DataFrame(open_source)

# Assemble the page: HTML heading, Markdown introduction, then the results
# table. Components created inside the context are attached to `demo`.
with gr.Blocks(css=custom_css) as demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
    gr.DataFrame(data)

# Start serving the app as soon as the script is executed.
demo.launch()