irena committed on
Commit
77f334e
·
1 Parent(s): 8106fd4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +137 -1
app.py CHANGED
@@ -1,3 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
  import gradio as gr
3
  from transformers import pipeline
@@ -69,7 +205,7 @@ with demo:
69
 
70
  demo.launch(enable_queue=True)
71
 
72
-
73
 
74
 
75
 
 
1
+
2
+ import os
3
+ import gradio as gr
4
+ from transformers import pipeline
5
+ from pytube import YouTube
6
+ from datasets import Dataset, Audio
7
+ from moviepy.editor import AudioFileClip
8
+
9
# Load the fine-tuned Whisper checkpoint ("irena/whisper-small-sv-SE", i.e. the
# small model fine-tuned for Swedish) once at import time; reused for every call.
pipe = pipeline(model="irena/whisper-small-sv-SE")
10
+
11
def download_from_youtube(url):
    """
    Downloads the audio track of the video at the given YouTube URL and
    returns the local file path of the downloaded file.
    """
    # Only audio streams in an mp4 container; the first match is downloaded.
    audio_streams = YouTube(url).streams.filter(only_audio=True, file_extension='mp4')
    return audio_streams.first().download()
18
+
19
def get_timestamp(seconds):
    """
    Creates a %M:%S timestamp string from a number of seconds.
    """
    # Same arithmetic as int-division/modulo, rendered with format specs
    # rather than str(...).zfill(2).
    mins = int(seconds / 60)
    secs = int(seconds % 60)
    return f"{mins:02d}:{secs:02d}"
26
+
27
def divide_into_30s_segments(audio_fpath, seconds_max):
    """
    Divides the audio file into 30s segments and returns the paths to the
    segments and the start times (in seconds) of the segments.

    :param audio_fpath: Path to the audio file.
    :param seconds_max: Maximum number of seconds to consider. If the audio
        file is longer than this, it will be truncated.
    :return: Tuple ``(segment_paths, segment_start_times)``.
    """
    if not os.path.exists("segmented_audios"):
        os.makedirs("segmented_audios")

    sound = AudioFileClip(audio_fpath)
    try:
        n_full_segments = int(sound.duration / 30)
        len_last_segment = sound.duration % 30

        # Truncate to at most seconds_max: keep only full 30s segments and
        # drop any trailing partial segment.
        max_segments = int(seconds_max / 30)
        if n_full_segments > max_segments:
            n_full_segments = max_segments
            len_last_segment = 0

        segment_paths = []
        segment_start_times = []

        segments_available = n_full_segments + 1
        for i in range(min(segments_available, max_segments)):
            start = i * 30

            # Skip last segment if it is smaller than two seconds
            is_last_segment = i == n_full_segments
            if is_last_segment and not len_last_segment > 2:
                continue
            elif is_last_segment:
                end = start + len_last_segment
            else:
                end = (i + 1) * 30

            segment_path = os.path.join("segmented_audios", f"segment_{i}.wav")
            segment = sound.subclip(start, end)
            segment.write_audiofile(segment_path)
            segment_paths.append(segment_path)
            segment_start_times.append(start)

        return segment_paths, segment_start_times
    finally:
        # Fix: release the clip's underlying reader (ffmpeg subprocess /
        # file handle); previously it was never closed and leaked per call.
        sound.close()
68
def get_translation(text):
    """
    Translates the given transcribed text to English.

    NOTE(review): the loaded model is "irena/whisper-small-sv-SE" (Swedish),
    so the input here is Swedish, not Chinese as the original docstring said.
    Currently a stub: always returns a TODO placeholder string.
    """
    return "TODO: Make API call to Google Translate to get English translation"
73
+
74
def transcribe(audio, url, seconds_max):
    """
    Transcribes a YouTube video if a url is specified and returns the transcription.
    If no url is specified, it transcribes the audio file as passed by Gradio.

    :param audio: Audio file as passed by Gradio. Only used if no url is specified.
    :param url: YouTube URL to transcribe.
    :param seconds_max: Maximum number of seconds to consider. If the audio file
        is longer than this, it will be truncated.
    """
    # Guard clause: plain microphone/file input is a single pass through the model.
    if not url:
        return pipe(audio)["text"]

    fpath = download_from_youtube(url)
    segment_paths, segment_start_times = divide_into_30s_segments(fpath, seconds_max)

    audio_dataset = Dataset.from_dict({"audio": segment_paths}).cast_column("audio", Audio(sampling_rate=16000))
    pred = pipe(audio_dataset["audio"])

    # Assemble the per-segment report with join instead of repeated +=.
    n_segments = len(segment_start_times)
    parts = []
    for i, (seconds, output) in enumerate(zip(segment_start_times, pred)):
        parts.append(f"[Segment {i+1}/{n_segments}, start time {get_timestamp(seconds)}]\n")
        parts.append(f"{output['text']}\n")
        parts.append(f"[Translation]\n{get_translation(output['text'])}\n\n")
    return "".join(parts)
99
+
100
# Fix: UI copy said "Chinese", but the loaded model is the Swedish checkpoint
# "irena/whisper-small-sv-SE" — labels updated to match the actual model.
block = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(source="microphone", type="filepath", label="Transcribe from Microphone"),
        gr.Text(max_lines=1, placeholder="Enter YouTube Link which has a Swedish video", label="Transcribe from YouTube URL"),
        gr.Slider(minimum=30, maximum=300, value=30, step=30, label="Number of seconds to transcribe from YouTube URL")
    ],
    outputs="text",
    title="Whisper Small Swedish",
    description="Realtime Swedish speech recognition",
)

block.launch()
113
+
114
+
115
+
116
+
117
+
118
+
119
+
120
+
121
+
122
+
123
+
124
+
125
+
126
+
127
+
128
+
129
+
130
+
131
+
132
+
133
+
134
+
135
+
136
+ '''
137
  import os
138
  import gradio as gr
139
  from transformers import pipeline
 
205
 
206
  demo.launch(enable_queue=True)
207
 
208
+ '''
209
 
210
 
211