Spaces:
Runtime error
Runtime error
ahmedghani
committed on
Commit
•
f414514
1
Parent(s):
01a2749
fixed mp3 format issue
Browse files- app.py +127 -108
- requirements.txt +1 -0
app.py
CHANGED
@@ -1,119 +1,138 @@
|
|
|
|
1 |
import whisper
|
2 |
import torch
|
|
|
3 |
import torchaudio
|
4 |
import streamlit as st
|
5 |
-
|
6 |
LANGUAGES = {
|
7 |
-
"
|
8 |
-
"
|
9 |
-
"
|
10 |
-
"
|
11 |
-
"
|
12 |
-
"
|
13 |
-
"
|
14 |
-
"
|
15 |
-
"
|
16 |
-
"
|
17 |
-
"
|
18 |
-
"
|
19 |
-
"
|
20 |
-
"
|
21 |
-
"
|
22 |
-
"
|
23 |
-
"
|
24 |
-
"
|
25 |
-
"
|
26 |
-
"
|
27 |
-
"
|
28 |
-
"
|
29 |
-
"
|
30 |
-
"
|
31 |
-
"
|
32 |
-
"
|
33 |
-
"
|
34 |
-
"
|
35 |
-
"
|
36 |
-
"
|
37 |
-
"
|
38 |
-
"
|
39 |
-
"
|
40 |
-
"
|
41 |
-
"
|
42 |
-
"
|
43 |
-
"
|
44 |
-
"
|
45 |
-
"
|
46 |
-
"
|
47 |
-
"
|
48 |
-
"
|
49 |
-
"
|
50 |
-
"
|
51 |
-
"
|
52 |
-
"
|
53 |
-
"
|
54 |
-
"
|
55 |
-
"
|
56 |
-
"
|
57 |
-
"
|
58 |
-
"
|
59 |
-
"
|
60 |
-
"
|
61 |
-
"
|
62 |
-
"
|
63 |
-
"
|
64 |
-
"
|
65 |
-
"
|
66 |
-
"
|
67 |
-
"
|
68 |
-
"
|
69 |
-
"
|
70 |
-
"
|
71 |
-
"
|
72 |
-
"
|
73 |
-
"
|
74 |
-
"
|
75 |
-
"
|
76 |
-
"
|
77 |
-
"
|
78 |
-
"
|
79 |
-
"
|
80 |
-
"
|
81 |
-
"
|
82 |
-
"
|
83 |
-
"
|
84 |
-
"
|
85 |
-
"
|
86 |
-
"
|
87 |
-
"haitian creole"
|
88 |
-
"
|
89 |
-
"
|
90 |
-
"
|
91 |
-
"
|
92 |
-
"
|
93 |
-
"
|
94 |
-
"
|
95 |
-
"
|
96 |
-
"
|
97 |
-
"
|
98 |
-
"
|
99 |
-
"
|
100 |
-
"
|
101 |
-
"
|
102 |
-
"
|
103 |
-
"
|
104 |
-
"
|
105 |
-
"
|
106 |
}
|
107 |
|
108 |
def decode(model, mel, options):
|
109 |
result = whisper.decode(model, mel, options)
|
110 |
return result.text
|
111 |
|
112 |
-
def load_audio(
|
113 |
-
|
114 |
-
if
|
115 |
-
|
116 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
117 |
|
118 |
def detect_language(model, mel):
|
119 |
_, probs = model.detect_language(mel)
|
@@ -136,7 +155,7 @@ def main():
|
|
136 |
st.sidebar.write(f"Model: {model_selection+' (Multilingual)' if not en_model_selection else model_selection + ' (English only)'}")
|
137 |
|
138 |
if st.sidebar.checkbox("Show supported languages", value=False):
|
139 |
-
st.sidebar.info(list(LANGUAGES.
|
140 |
st.sidebar.title("Options")
|
141 |
|
142 |
beam_size = st.sidebar.slider("Beam Size", min_value=1, max_value=10, value=5)
|
@@ -151,7 +170,7 @@ def main():
|
|
151 |
audio_file = st.file_uploader("Upload Audio", type=["wav", "mp3", "flac"])
|
152 |
|
153 |
if audio_file is not None:
|
154 |
-
st.audio(audio_file, format=
|
155 |
with st.spinner("Loading model..."):
|
156 |
model = whisper.load_model(model_selection)
|
157 |
model = model.to("cpu") if not torch.cuda.is_available() else model.to("cuda")
|
@@ -164,7 +183,7 @@ def main():
|
|
164 |
if not en_model_selection:
|
165 |
with st.spinner("Detecting language..."):
|
166 |
language = detect_language(model, mel)
|
167 |
-
st.markdown(f"Detected Language: {language}")
|
168 |
else:
|
169 |
language = "en"
|
170 |
configuration = {"beam_size": beam_size, "fp16": fp16, "task": task, "language": language}
|
|
|
1 |
+
import io
|
2 |
import whisper
|
3 |
import torch
|
4 |
+
import ffmpeg
|
5 |
import torchaudio
|
6 |
import streamlit as st
|
|
|
7 |
LANGUAGES = {
|
8 |
+
"en":"english",
|
9 |
+
"zh":"chinese",
|
10 |
+
"de":"german",
|
11 |
+
"es":"spanish",
|
12 |
+
"ru":"russian",
|
13 |
+
"ko":"korean",
|
14 |
+
"fr":"french",
|
15 |
+
"ja":"japanese",
|
16 |
+
"pt":"portuguese",
|
17 |
+
"tr":"turkish",
|
18 |
+
"pl":"polish",
|
19 |
+
"ca":"catalan",
|
20 |
+
"nl":"dutch",
|
21 |
+
"ar":"arabic",
|
22 |
+
"sv":"swedish",
|
23 |
+
"it":"italian",
|
24 |
+
"id":"indonesian",
|
25 |
+
"hi":"hindi",
|
26 |
+
"fi":"finnish",
|
27 |
+
"vi":"vietnamese",
|
28 |
+
"iw":"hebrew",
|
29 |
+
"uk":"ukrainian",
|
30 |
+
"el":"greek",
|
31 |
+
"ms":"malay",
|
32 |
+
"cs":"czech",
|
33 |
+
"ro":"romanian",
|
34 |
+
"da":"danish",
|
35 |
+
"hu":"hungarian",
|
36 |
+
"ta":"tamil",
|
37 |
+
"no":"norwegian",
|
38 |
+
"th":"thai",
|
39 |
+
"ur":"urdu",
|
40 |
+
"hr":"croatian",
|
41 |
+
"bg":"bulgarian",
|
42 |
+
"lt":"lithuanian",
|
43 |
+
"la":"latin",
|
44 |
+
"mi":"maori",
|
45 |
+
"ml":"malayalam",
|
46 |
+
"cy":"welsh",
|
47 |
+
"sk":"slovak",
|
48 |
+
"te":"telugu",
|
49 |
+
"fa":"persian",
|
50 |
+
"lv":"latvian",
|
51 |
+
"bn":"bengali",
|
52 |
+
"sr":"serbian",
|
53 |
+
"az":"azerbaijani",
|
54 |
+
"sl":"slovenian",
|
55 |
+
"kn":"kannada",
|
56 |
+
"et":"estonian",
|
57 |
+
"mk":"macedonian",
|
58 |
+
"br":"breton",
|
59 |
+
"eu":"basque",
|
60 |
+
"is":"icelandic",
|
61 |
+
"hy":"armenian",
|
62 |
+
"ne":"nepali",
|
63 |
+
"mn":"mongolian",
|
64 |
+
"bs":"bosnian",
|
65 |
+
"kk":"kazakh",
|
66 |
+
"sq":"albanian",
|
67 |
+
"sw":"swahili",
|
68 |
+
"gl":"galician",
|
69 |
+
"mr":"marathi",
|
70 |
+
"pa":"punjabi",
|
71 |
+
"si":"sinhala",
|
72 |
+
"km":"khmer",
|
73 |
+
"sn":"shona",
|
74 |
+
"yo":"yoruba",
|
75 |
+
"so":"somali",
|
76 |
+
"af":"afrikaans",
|
77 |
+
"oc":"occitan",
|
78 |
+
"ka":"georgian",
|
79 |
+
"be":"belarusian",
|
80 |
+
"tg":"tajik",
|
81 |
+
"sd":"sindhi",
|
82 |
+
"gu":"gujarati",
|
83 |
+
"am":"amharic",
|
84 |
+
"yi":"yiddish",
|
85 |
+
"lo":"lao",
|
86 |
+
"uz":"uzbek",
|
87 |
+
"fo":"faroese",
|
88 |
+
"ht":"haitian creole",
|
89 |
+
"ps":"pashto",
|
90 |
+
"tk":"turkmen",
|
91 |
+
"nn":"nynorsk",
|
92 |
+
"mt":"maltese",
|
93 |
+
"sa":"sanskrit",
|
94 |
+
"lb":"luxembourgish",
|
95 |
+
"my":"myanmar",
|
96 |
+
"bo":"tibetan",
|
97 |
+
"tl":"tagalog",
|
98 |
+
"mg":"malagasy",
|
99 |
+
"as":"assamese",
|
100 |
+
"tt":"tatar",
|
101 |
+
"haw":"hawaiian",
|
102 |
+
"ln":"lingala",
|
103 |
+
"ha":"hausa",
|
104 |
+
"ba":"bashkir",
|
105 |
+
"jw":"javanese",
|
106 |
+
"su":"sundanese",
|
107 |
}
|
108 |
|
109 |
def decode(model, mel, options):
    """Decode ``mel`` with ``model`` and return the transcribed text.

    Thin wrapper around ``whisper.decode``: the three arguments are passed
    through unchanged and only the ``text`` attribute of the result is
    returned.
    """
    return whisper.decode(model, mel, options).text
|
112 |
|
113 |
+
def _to_mono_16k(wave, sr):
    """Resample a ``(channels, samples)`` waveform to 16 kHz and downmix it to 1-D mono."""
    if sr != 16000:
        wave = torchaudio.transforms.Resample(sr, 16000)(wave)
    # Averaging over the channel axis equals ``squeeze(0)`` for mono input but
    # also collapses stereo files, which ``squeeze(0)`` would leave as 2-D.
    return wave.mean(dim=0)


def load_audio(audio):
    """Load an uploaded audio file as a mono 16 kHz waveform tensor.

    Args:
        audio: Uploaded file object exposing a ``type`` attribute (MIME type)
            and a file-like interface (``read``), as returned by
            ``st.file_uploader``.

    Returns:
        A 1-D tensor of samples at 16 kHz, or ``None`` when the MIME type is
        not supported (an error message is shown through ``st.error``).
    """
    # NOTE(review): some browsers are believed to report the "x-" prefixed
    # MIME variants for the same formats; accepted here so those uploads are
    # not rejected -- confirm against the browsers you target.
    wav_flac_types = ("audio/wav", "audio/x-wav", "audio/flac", "audio/x-flac")
    mp3_types = ("audio/mpeg", "audio/mp3")

    if audio.type in wav_flac_types:
        wave, sr = torchaudio.load(audio)
        return _to_mono_16k(wave, sr)

    if audio.type in mp3_types:
        # Transcode the MP3 bytes to 16 kHz mono PCM WAV through ffmpeg pipes
        # so torchaudio can read the result from an in-memory buffer.
        wav_bytes, _ = (
            ffmpeg
            .input('pipe:0')
            .output('pipe:1', format='wav', acodec='pcm_s16le', ac=1, ar='16k')
            .run(capture_stdout=True, input=audio.read())
        )
        wave, sr = torchaudio.load(io.BytesIO(wav_bytes))
        return _to_mono_16k(wave, sr)

    st.error("Unsupported audio format")
    return None
|
136 |
|
137 |
def detect_language(model, mel):
|
138 |
_, probs = model.detect_language(mel)
|
|
|
155 |
st.sidebar.write(f"Model: {model_selection+' (Multilingual)' if not en_model_selection else model_selection + ' (English only)'}")
|
156 |
|
157 |
if st.sidebar.checkbox("Show supported languages", value=False):
|
158 |
+
st.sidebar.info(list(LANGUAGES.values()))
|
159 |
st.sidebar.title("Options")
|
160 |
|
161 |
beam_size = st.sidebar.slider("Beam Size", min_value=1, max_value=10, value=5)
|
|
|
170 |
audio_file = st.file_uploader("Upload Audio", type=["wav", "mp3", "flac"])
|
171 |
|
172 |
if audio_file is not None:
|
173 |
+
st.audio(audio_file, format=audio_file.type)
|
174 |
with st.spinner("Loading model..."):
|
175 |
model = whisper.load_model(model_selection)
|
176 |
model = model.to("cpu") if not torch.cuda.is_available() else model.to("cuda")
|
|
|
183 |
if not en_model_selection:
|
184 |
with st.spinner("Detecting language..."):
|
185 |
language = detect_language(model, mel)
|
186 |
+
st.markdown(f"Detected Language: {LANGUAGES[language]} ({language})")
|
187 |
else:
|
188 |
language = "en"
|
189 |
configuration = {"beam_size": beam_size, "fp16": fp16, "task": task, "language": language}
|
requirements.txt
CHANGED
@@ -3,6 +3,7 @@ numpy
|
|
3 |
torch
|
4 |
torchaudio
|
5 |
tqdm
|
|
|
6 |
more-itertools
|
7 |
transformers>=4.19.0
|
8 |
ffmpeg-python==0.2.0
|
|
|
3 |
torch
|
4 |
torchaudio
|
5 |
tqdm
|
6 |
+
ffmpeg-python
|
7 |
more-itertools
|
8 |
transformers>=4.19.0
|
9 |
ffmpeg-python==0.2.0
|