ahmedghani committed on
Commit
f414514
1 Parent(s): 01a2749

fixed mp3 format issue

Browse files
Files changed (2) hide show
  1. app.py +127 -108
  2. requirements.txt +1 -0
app.py CHANGED
@@ -1,119 +1,138 @@
 
1
  import whisper
2
  import torch
 
3
  import torchaudio
4
  import streamlit as st
5
-
6
  LANGUAGES = {
7
- "english":"en",
8
- "chinese":"zh",
9
- "german":"de",
10
- "spanish":"es",
11
- "russian":"ru",
12
- "korean":"ko",
13
- "french":"fr",
14
- "japanese":"ja",
15
- "portuguese":"pt",
16
- "turkish":"tr",
17
- "polish":"pl",
18
- "catalan":"ca",
19
- "dutch":"nl",
20
- "arabic":"ar",
21
- "swedish":"sv",
22
- "italian":"it",
23
- "indonesian":"id",
24
- "hindi":"hi",
25
- "finnish":"fi",
26
- "vietnamese":"vi",
27
- "hebrew":"iw",
28
- "ukrainian":"uk",
29
- "greek":"el",
30
- "malay":"ms",
31
- "czech":"cs",
32
- "romanian":"ro",
33
- "danish":"da",
34
- "hungarian":"hu",
35
- "tamil":"ta",
36
- "norwegian":"no",
37
- "thai":"th",
38
- "urdu":"ur",
39
- "croatian":"hr",
40
- "bulgarian":"bg",
41
- "lithuanian":"lt",
42
- "latin":"la",
43
- "maori":"mi",
44
- "malayalam":"ml",
45
- "welsh":"cy",
46
- "slovak":"sk",
47
- "telugu":"te",
48
- "persian":"fa",
49
- "latvian":"lv",
50
- "bengali":"bn",
51
- "serbian":"sr",
52
- "azerbaijani":"az",
53
- "slovenian":"sl",
54
- "kannada":"kn",
55
- "estonian":"et",
56
- "macedonian":"mk",
57
- "breton":"br",
58
- "basque":"eu",
59
- "icelandic":"is",
60
- "armenian":"hy",
61
- "nepali":"ne",
62
- "mongolian":"mn",
63
- "bosnian":"bs",
64
- "kazakh":"kk",
65
- "albanian":"sq",
66
- "swahili":"sw",
67
- "galician":"gl",
68
- "marathi":"mr",
69
- "punjabi":"pa",
70
- "sinhala":"si",
71
- "khmer":"km",
72
- "shona":"sn",
73
- "yoruba":"yo",
74
- "somali":"so",
75
- "afrikaans":"af",
76
- "occitan":"oc",
77
- "georgian":"ka",
78
- "belarusian":"be",
79
- "tajik":"tg",
80
- "sindhi":"sd",
81
- "gujarati":"gu",
82
- "amharic":"am",
83
- "yiddish":"yi",
84
- "lao":"lo",
85
- "uzbek":"uz",
86
- "faroese":"fo",
87
- "haitian creole":"ht",
88
- "pashto":"ps",
89
- "turkmen":"tk",
90
- "nynorsk":"nn",
91
- "maltese":"mt",
92
- "sanskrit":"sa",
93
- "luxembourgish":"lb",
94
- "myanmar":"my",
95
- "tibetan":"bo",
96
- "tagalog":"tl",
97
- "malagasy":"mg",
98
- "assamese":"as",
99
- "tatar":"tt",
100
- "hawaiian":"haw",
101
- "lingala":"ln",
102
- "hausa":"ha",
103
- "bashkir":"ba",
104
- "javanese":"jw",
105
- "sundanese":"su",
106
  }
107
 
108
  def decode(model, mel, options):
109
  result = whisper.decode(model, mel, options)
110
  return result.text
111
 
112
- def load_audio(path):
113
- waveform, sample_rate = torchaudio.load(path)
114
- if sample_rate != 16000:
115
- waveform = torchaudio.transforms.Resample(sample_rate, 16000)(waveform)
116
- return waveform.squeeze(0)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
 
118
  def detect_language(model, mel):
119
  _, probs = model.detect_language(mel)
@@ -136,7 +155,7 @@ def main():
136
  st.sidebar.write(f"Model: {model_selection+' (Multilingual)' if not en_model_selection else model_selection + ' (English only)'}")
137
 
138
  if st.sidebar.checkbox("Show supported languages", value=False):
139
- st.sidebar.info(list(LANGUAGES.keys()))
140
  st.sidebar.title("Options")
141
 
142
  beam_size = st.sidebar.slider("Beam Size", min_value=1, max_value=10, value=5)
@@ -151,7 +170,7 @@ def main():
151
  audio_file = st.file_uploader("Upload Audio", type=["wav", "mp3", "flac"])
152
 
153
  if audio_file is not None:
154
- st.audio(audio_file, format='audio/ogg')
155
  with st.spinner("Loading model..."):
156
  model = whisper.load_model(model_selection)
157
  model = model.to("cpu") if not torch.cuda.is_available() else model.to("cuda")
@@ -164,7 +183,7 @@ def main():
164
  if not en_model_selection:
165
  with st.spinner("Detecting language..."):
166
  language = detect_language(model, mel)
167
- st.markdown(f"Detected Language: {language}")
168
  else:
169
  language = "en"
170
  configuration = {"beam_size": beam_size, "fp16": fp16, "task": task, "language": language}
 
1
+ import io
2
  import whisper
3
  import torch
4
+ import ffmpeg
5
  import torchaudio
6
  import streamlit as st
 
7
  LANGUAGES = {
8
+ "en":"english",
9
+ "zh":"chinese",
10
+ "de":"german",
11
+ "es":"spanish",
12
+ "ru":"russian",
13
+ "ko":"korean",
14
+ "fr":"french",
15
+ "ja":"japanese",
16
+ "pt":"portuguese",
17
+ "tr":"turkish",
18
+ "pl":"polish",
19
+ "ca":"catalan",
20
+ "nl":"dutch",
21
+ "ar":"arabic",
22
+ "sv":"swedish",
23
+ "it":"italian",
24
+ "id":"indonesian",
25
+ "hi":"hindi",
26
+ "fi":"finnish",
27
+ "vi":"vietnamese",
28
+ "iw":"hebrew",
29
+ "uk":"ukrainian",
30
+ "el":"greek",
31
+ "ms":"malay",
32
+ "cs":"czech",
33
+ "ro":"romanian",
34
+ "da":"danish",
35
+ "hu":"hungarian",
36
+ "ta":"tamil",
37
+ "no":"norwegian",
38
+ "th":"thai",
39
+ "ur":"urdu",
40
+ "hr":"croatian",
41
+ "bg":"bulgarian",
42
+ "lt":"lithuanian",
43
+ "la":"latin",
44
+ "mi":"maori",
45
+ "ml":"malayalam",
46
+ "cy":"welsh",
47
+ "sk":"slovak",
48
+ "te":"telugu",
49
+ "fa":"persian",
50
+ "lv":"latvian",
51
+ "bn":"bengali",
52
+ "sr":"serbian",
53
+ "az":"azerbaijani",
54
+ "sl":"slovenian",
55
+ "kn":"kannada",
56
+ "et":"estonian",
57
+ "mk":"macedonian",
58
+ "br":"breton",
59
+ "eu":"basque",
60
+ "is":"icelandic",
61
+ "hy":"armenian",
62
+ "ne":"nepali",
63
+ "mn":"mongolian",
64
+ "bs":"bosnian",
65
+ "kk":"kazakh",
66
+ "sq":"albanian",
67
+ "sw":"swahili",
68
+ "gl":"galician",
69
+ "mr":"marathi",
70
+ "pa":"punjabi",
71
+ "si":"sinhala",
72
+ "km":"khmer",
73
+ "sn":"shona",
74
+ "yo":"yoruba",
75
+ "so":"somali",
76
+ "af":"afrikaans",
77
+ "oc":"occitan",
78
+ "ka":"georgian",
79
+ "be":"belarusian",
80
+ "tg":"tajik",
81
+ "sd":"sindhi",
82
+ "gu":"gujarati",
83
+ "am":"amharic",
84
+ "yi":"yiddish",
85
+ "lo":"lao",
86
+ "uz":"uzbek",
87
+ "fo":"faroese",
88
+ "ht":"haitian creole",
89
+ "ps":"pashto",
90
+ "tk":"turkmen",
91
+ "nn":"nynorsk",
92
+ "mt":"maltese",
93
+ "sa":"sanskrit",
94
+ "lb":"luxembourgish",
95
+ "my":"myanmar",
96
+ "bo":"tibetan",
97
+ "tl":"tagalog",
98
+ "mg":"malagasy",
99
+ "as":"assamese",
100
+ "tt":"tatar",
101
+ "haw":"hawaiian",
102
+ "ln":"lingala",
103
+ "ha":"hausa",
104
+ "ba":"bashkir",
105
+ "jw":"javanese",
106
+ "su":"sundanese",
107
  }
108
 
109
  def decode(model, mel, options):
110
  result = whisper.decode(model, mel, options)
111
  return result.text
112
 
113
+ def load_audio(audio):
114
+ print(audio.type)
115
+ if audio.type == "audio/wav" or audio.type == "audio/flac":
116
+ wave, sr = torchaudio.load(audio)
117
+ if sr != 16000:
118
+ wave = torchaudio.transforms.Resample(sr, 16000)(wave)
119
+ return wave.squeeze(0)
120
+
121
+ elif audio.type == "audio/mpeg":
122
+ audio = audio.read()
123
+ audio, _ = (ffmpeg
124
+ .input('pipe:0')
125
+ .output('pipe:1', format='wav', acodec='pcm_s16le', ac=1, ar='16k')
126
+ .run(capture_stdout=True, input=audio)
127
+ )
128
+ audio = io.BytesIO(audio)
129
+ wave, sr = torchaudio.load(audio)
130
+ if sr != 16000:
131
+ wave = torchaudio.transforms.Resample(sr, 16000)(wave)
132
+ return wave.squeeze(0)
133
+
134
+ else:
135
+ st.error("Unsupported audio format")
136
 
137
  def detect_language(model, mel):
138
  _, probs = model.detect_language(mel)
 
155
  st.sidebar.write(f"Model: {model_selection+' (Multilingual)' if not en_model_selection else model_selection + ' (English only)'}")
156
 
157
  if st.sidebar.checkbox("Show supported languages", value=False):
158
+ st.sidebar.info(list(LANGUAGES.values()))
159
  st.sidebar.title("Options")
160
 
161
  beam_size = st.sidebar.slider("Beam Size", min_value=1, max_value=10, value=5)
 
170
  audio_file = st.file_uploader("Upload Audio", type=["wav", "mp3", "flac"])
171
 
172
  if audio_file is not None:
173
+ st.audio(audio_file, format=audio_file.type)
174
  with st.spinner("Loading model..."):
175
  model = whisper.load_model(model_selection)
176
  model = model.to("cpu") if not torch.cuda.is_available() else model.to("cuda")
 
183
  if not en_model_selection:
184
  with st.spinner("Detecting language..."):
185
  language = detect_language(model, mel)
186
+ st.markdown(f"Detected Language: {LANGUAGES[language]} ({language})")
187
  else:
188
  language = "en"
189
  configuration = {"beam_size": beam_size, "fp16": fp16, "task": task, "language": language}
requirements.txt CHANGED
@@ -3,6 +3,7 @@ numpy
3
  torch
4
  torchaudio
5
  tqdm
 
6
  more-itertools
7
  transformers>=4.19.0
8
  ffmpeg-python==0.2.0
 
3
  torch
4
  torchaudio
5
  tqdm
6
+ ffmpeg-python
7
  more-itertools
8
  transformers>=4.19.0
9
  ffmpeg-python==0.2.0