irena committed on
Commit
306c96d
·
1 Parent(s): 77f334e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +25 -189
app.py CHANGED
@@ -1,212 +1,48 @@
1
-
2
- import os
3
  import gradio as gr
4
  from transformers import pipeline
5
  from pytube import YouTube
6
- from datasets import Dataset, Audio
7
- from moviepy.editor import AudioFileClip
8
 
9
  pipe = pipeline(model="irena/whisper-small-sv-SE")
10
 
11
- def download_from_youtube(url):
12
- """
13
- Downloads the video from the given YouTube URL and returns the path to the audio file.
14
- """
15
- streams = YouTube(url).streams.filter(only_audio=True, file_extension='mp4')
16
- fpath = streams.first().download()
17
- return fpath
18
-
19
- def get_timestamp(seconds):
20
- """
21
- Creates %M:%S timestamp from seconds.
22
- """
23
- minutes = int(seconds / 60)
24
- seconds = int(seconds % 60)
25
- return f"{str(minutes).zfill(2)}:{str(seconds).zfill(2)}"
26
-
27
- def divide_into_30s_segments(audio_fpath, seconds_max):
28
- """
29
- Divides the audio file into 30s segments and returns the paths to the segments and the start times of the segments.
30
- :param audio_fpath: Path to the audio file.
31
- :param seconds_max: Maximum number of seconds to consider. If the audio file is longer than this, it will be truncated.
32
- """
33
- if not os.path.exists("segmented_audios"):
34
- os.makedirs("segmented_audios")
35
-
36
- sound = AudioFileClip(audio_fpath)
37
- n_full_segments = int(sound.duration / 30)
38
- len_last_segment = sound.duration % 30
39
-
40
- max_segments = int(seconds_max / 30)
41
- if n_full_segments > max_segments:
42
- n_full_segments = max_segments
43
- len_last_segment = 0
44
-
45
- segment_paths = []
46
- segment_start_times = []
47
-
48
- segments_available = n_full_segments + 1
49
- for i in range(min(segments_available, max_segments)):
50
- start = i * 30
51
-
52
- # Skip last segment if it is smaller than two seconds
53
- is_last_segment = i == n_full_segments
54
- if is_last_segment and not len_last_segment > 2:
55
- continue
56
- elif is_last_segment:
57
- end = start + len_last_segment
58
- else:
59
- end = (i + 1) * 30
60
-
61
- segment_path = os.path.join("segmented_audios", f"segment_{i}.wav")
62
- segment = sound.subclip(start, end)
63
- segment.write_audiofile(segment_path)
64
- segment_paths.append(segment_path)
65
- segment_start_times.append(start)
66
-
67
- return segment_paths, segment_start_times
68
- def get_translation(text):
69
- """
70
- Translates the given Chinese text to English.
71
- """
72
- return "TODO: Make API call to Google Translate to get English translation"
73
-
74
- def transcribe(audio, url, seconds_max):
75
- """
76
- Transcribes a YouTube video if a url is specified and returns the transcription.
77
- If not url is specified, it transcribes the audio file as passed by Gradio.
78
- :param audio: Audio file as passed by Gradio. Only used if no url is specified.
79
- :param url: YouTube URL to transcribe.
80
- :param seconds_max: Maximum number of seconds to consider. If the audio file is longer than this, it will be truncated.
81
- """
82
- if url:
83
- fpath = download_from_youtube(url)
84
- segment_paths, segment_start_times = divide_into_30s_segments(fpath, seconds_max)
85
-
86
- audio_dataset = Dataset.from_dict({"audio": segment_paths}).cast_column("audio", Audio(sampling_rate=16000))
87
- pred = pipe(audio_dataset["audio"])
88
- text = ""
89
- n_segments = len(segment_start_times)
90
- for i, (seconds, output) in enumerate(zip(segment_start_times, pred)):
91
- text += f"[Segment {i+1}/{n_segments}, start time {get_timestamp(seconds)}]\n"
92
- text += f"{output['text']}\n"
93
- text += f"[Translation]\n{get_translation(output['text'])}\n\n"
94
- return text
95
-
96
- else:
97
- text = pipe(audio)["text"]
98
- return text
99
-
100
- block = gr.Interface(
101
- fn=transcribe,
102
- inputs=[
103
- gr.Audio(source="microphone", type="filepath", label="Transcribe from Microphone"),
104
- gr.Text(max_lines=1, placeholder="Enter YouTube Link which has a Chinese video", label="Transcribe from YouTube URL"),
105
- gr.Slider(minimum=30, maximum=300, value=30, step=30, label="Number of seconds to transcribe from YouTube URL")
106
- ],
107
- outputs="text",
108
- title="Whisper Small Chinese",
109
- description="Realtime Chinese speech recognition",
110
- )
111
-
112
- block.launch()
113
-
114
-
115
-
116
-
117
-
118
-
119
-
120
-
121
-
122
-
123
-
124
-
125
-
126
-
127
-
128
-
129
-
130
 
 
 
 
131
 
132
 
133
 
134
-
135
-
136
- '''
137
- import os
138
- import gradio as gr
139
- from transformers import pipeline
140
- import gradio as gr
141
- import torch
142
- import spacy
143
-
144
- os.system('pip install https://huggingface.co/Armandoliv/es_pipeline/resolve/main/es_pipeline-any-py3-none-any.whl')
145
-
146
- pipe = pipeline(model="irena/whisper-small-sv-SE")
147
- nlp_ner = spacy.load("es_pipeline")
148
- def main_generator(youtube_id:str):
149
- YouTubeID = youtube_id.split("https://www.youtube.com/watch?v=") #
150
- if len(YouTubeID)>1:
151
- YouTubeID = YouTubeID[1]
152
- else:
153
- YouTubeID ='xOZM-1p-jAk'
154
-
155
- OutputFile = f'test_audio_youtube_{YouTubeID}.m4a'
156
-
157
- os.system(f"youtube-dl -o {OutputFile} {YouTubeID} --extract-audio --restrict-filenames -f 'bestaudio[ext=m4a]'")
158
-
159
- result = pipe(OutputFile)
160
- text = result['text']
161
-
162
- output_list = []
163
-
164
- output_list.append(text)
165
-
166
- return text
167
-
168
-
169
-
170
- def transcribe(audio):
171
- text = pipe(audio)["text"]
172
- return text
173
-
174
- demo = gr.Blocks()
175
-
176
-
177
- iface = gr.Interface(
178
- fn=transcribe,
179
  inputs=gr.Audio(source="microphone", type="filepath"),
180
  outputs="text",
181
- title="Whisper Small Swedish-Microphone",
182
- description="Realtime demo for Swedish speech recognition using a fine-tuned Whisper small model. An audio for recognize.",
183
  )
184
 
185
- inputs = [gr.Textbox(lines=1, placeholder="Link of youtube video here...", label="Input")]
186
- outputs = gr.HighlightedText()
187
- title="Transcription of Swedish videos"
188
- description = "This demo uses small Whisper to transcribe what is spoken in a swedish video"
189
- examples = ['https://www.youtube.com/watch?v=6eWhV7xYH-Q']
190
- io = gr.Interface(fn=main_generator, inputs=inputs, outputs=outputs, title=title, description = description, examples = examples,
191
 
192
- css= """.gr-button-primary { background: -webkit-linear-gradient(
193
- 90deg, #355764 0%, #55a8a1 100% ) !important; background: #355764;
194
- background: linear-gradient(
195
- 90deg, #355764 0%, #55a8a1 100% ) !important;
196
- background: -moz-linear-gradient( 90deg, #355764 0%, #55a8a1 100% ) !important;
197
- background: -webkit-linear-gradient(
198
- 90deg, #355764 0%, #55a8a1 100% ) !important;
199
- color:white !important}"""
200
- )
201
 
202
 
203
- with demo:
204
- gr.TabbedInterface([iface, yt], ["Transcribe Audio", "Transcribe YouTube"])
205
 
206
- demo.launch(enable_queue=True)
207
 
208
- '''
 
209
 
 
210
 
211
 
212
 
 
 
 
1
  import gradio as gr
2
  from transformers import pipeline
3
  from pytube import YouTube
 
 
4
 
5
  pipe = pipeline(model="irena/whisper-small-sv-SE")
6
 
7
def transcribe_video(url):
    """Download the audio track of a YouTube video and transcribe it.

    :param url: Full YouTube video URL.
    :return: Text transcribed by the module-level Whisper pipeline ``pipe``.
    :raises ValueError: If no audio-only stream is available for the URL.
    """
    # .first() replaces the deprecated StreamQuery.all()[0] pattern; it
    # returns the first matching audio-only stream, or None when the
    # filter matches nothing (the old code raised a bare IndexError here).
    stream = YouTube(url).streams.filter(only_audio=True).first()
    if stream is None:
        raise ValueError(f"No audio-only stream found for URL: {url}")
    # download() saves the stream to the working directory and returns its path.
    audio_path = stream.download()
    return pipe(audio_path)["text"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
def transcribe_audio(audio):
    """Transcribe a recorded audio file with the module-level Whisper pipeline.

    :param audio: Filesystem path to the audio clip (as supplied by Gradio).
    :return: The recognized text.
    """
    return pipe(audio)["text"]
16
 
17
 
18
 
19
# Tab 1: live Swedish transcription from the browser microphone.
# Gradio passes the recording to transcribe_audio as a file path.
microphone_input = gr.Audio(source="microphone", type="filepath")

audio = gr.Interface(
    fn=transcribe_audio,
    inputs=microphone_input,
    outputs="text",
    title="Whisper Small Swedish",
    description="Realtime demo for Swedish speech recognition using a fine-tuned Whisper small model.",
)
26
 
 
 
 
 
 
 
27
 
28
+
29
# Tab 2: transcription of a YouTube video given its URL.
# The textbox value is forwarded verbatim to transcribe_video.
url_input = gr.Textbox(label="Enter a YouTube URL:")

video = gr.Interface(
    fn=transcribe_video,
    inputs=url_input,
    outputs="text",
    title="Whisper Small Swedish",
    description="Transcribe swedish videos from YouTube",
)
36
+
37
 
38
 
 
 
39
 
40
# Combine both interfaces into one tabbed app; tab labels are positional.
_tabs = [audio, video]
_tab_names = ["transcribe from recording", "transcribe from youtube url"]
demo = gr.TabbedInterface(_tabs, _tab_names)

# Launch only when executed as a script, not on import.
if __name__ == "__main__":
    demo.launch()
44
 
45
+
46
 
47
 
48