marquesafonso
committed on
Commit
•
ffa3aaf
1
Parent(s):
655abb7
add mvp api with desired functionalities
Browse files- Pipfile +15 -0
- Pipfile.lock +0 -0
- app.py +21 -0
- requirements.txt +0 -0
- src/transcriber.py +59 -0
Pipfile
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[[source]]
|
2 |
+
url = "https://pypi.org/simple"
|
3 |
+
verify_ssl = true
|
4 |
+
name = "pypi"
|
5 |
+
|
6 |
+
[packages]
|
7 |
+
faster-whisper = "*"
|
8 |
+
gradio = "*"
|
9 |
+
moviepy = "*"
|
10 |
+
|
11 |
+
[dev-packages]
|
12 |
+
|
13 |
+
[requires]
|
14 |
+
python_version = "3.11"
|
15 |
+
python_full_version = "3.11.9"
|
Pipfile.lock
ADDED
The diff for this file is too large to render.
See raw diff
|
|
app.py
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
from src.transcriber import transcriber
|
3 |
+
|
4 |
+
def main():
    """Assemble the Gradio page for the transcriber and start serving it.

    The page shows a heading and a short description, then an interface
    that feeds a video file, a max-words-per-line number and a task choice
    into ``transcriber`` and displays the returned text and SRT file.
    """
    page = gr.Blocks(analytics_enabled=False, title='multilang-asr-transcriber')
    with page as demo:
        # Heading and description are rendered first, in this order.
        gr.Markdown('# multilang-asr-transcriber')
        gr.Markdown('### A multilingual automatic speech transcription tool using [faster-whisper](https://github.com/SYSTRAN/faster-whisper). Supports translation to english and user setting of max words per line.')

        # Input widgets; creation order is kept as in the interface inputs.
        upload = gr.File(file_types=["video"], type="filepath")
        words_limit = gr.Number(value=6, label="Max words per line")
        task_choice = gr.Dropdown(
            choices=["transcribe", "translate"],
            value="transcribe",
            label="Select Task",
        )

        # Output widgets.
        transcript_box = gr.Textbox(label="Text transcription")
        srt_download = gr.File(file_count="single", file_types=[".srt"], label="SRT file")

        interface_inputs = [upload, words_limit, task_choice]
        interface_outputs = [transcript_box, srt_download]
        gr.Interface(
            transcriber,
            inputs=interface_inputs,
            outputs=interface_outputs,
            allow_flagging="never",
            analytics_enabled=False,
        )
    demo.launch()
|
19 |
+
|
20 |
+
# Start the UI only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
|
requirements.txt
ADDED
Binary file (6.88 kB). View file
|
|
src/transcriber.py
ADDED
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import gradio as gr
|
3 |
+
from faster_whisper import WhisperModel
|
4 |
+
from moviepy.editor import VideoFileClip
|
5 |
+
|
6 |
+
def convert_video_to_audio(video_input):
    """Extract the audio track of a video into an AAC-encoded ``.m4a`` file.

    The audio file is written to the video's own path with the final
    extension replaced by ``.m4a``.

    Args:
        video_input: Path of the video file to read.

    Returns:
        The normalised path of the audio file that was written.

    Raises:
        ValueError: If the opened clip exposes no audio track.
    """
    # splitext removes only the last extension, so dots in directory names
    # or in the file stem (e.g. "my.video.mp4") do not truncate the path.
    stem, _ext = os.path.splitext(video_input)
    audio_clip_filepath = os.path.normpath(f"{stem}.m4a")

    video_clip = VideoFileClip(video_input)
    audio_clip = None
    try:
        audio_clip = video_clip.audio
        if audio_clip is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise ValueError(f"No audio track found in {video_input!r}")
        audio_clip.write_audiofile(audio_clip_filepath, codec='aac')
    finally:
        # Close both clips even when writing fails, in the same order as
        # before: audio first, then video.
        if audio_clip is not None:
            audio_clip.close()
        video_clip.close()
    return audio_clip_filepath
|
14 |
+
|
15 |
+
def convert_seconds_to_time(seconds):
    """Format a duration in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    Args:
        seconds: Duration in seconds; anything ``float()`` accepts.

    Returns:
        The timestamp string, e.g. ``"01:01:01,500"`` for ``3661.5``.
        Negative durations are clamped to ``"00:00:00,000"``.
    """
    # Round to whole milliseconds first and do the rest in integers.
    # Truncating the fractional part instead loses a millisecond to binary
    # float error (1.001 would come out as ",000").
    total_ms = max(round(float(seconds) * 1000), 0)
    total_seconds, milliseconds = divmod(total_ms, 1000)
    total_minutes, whole_seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02}:{minutes:02}:{whole_seconds:02},{milliseconds:03}"
|
22 |
+
|
23 |
+
def write_srt(segments, max_words_per_line, srt_path):
    """Write word-timestamped segments as SRT subtitles and return the text.

    Each segment's words are split into consecutive groups of at most
    ``max_words_per_line`` words. Every group becomes one numbered SRT
    entry spanning from its first word's start to its last word's end.
    Groups never cross segment boundaries.

    Args:
        segments: Iterable of objects with a ``words`` attribute; each word
            exposes ``start``, ``end`` and ``word``.
        max_words_per_line: Maximum words per subtitle entry. It is
            truncated to an integer; a value below 1, or one that cannot be
            converted, puts each whole segment on a single entry.
        srt_path: Destination path of the SRT file (written as UTF-8).

    Returns:
        A ``(srt_text, srt_path)`` tuple.
    """
    # The limit may arrive as a float or None (e.g. from a numeric UI
    # field), so coerce it once instead of comparing a length against it
    # with ``==``, which silently never matches for values like 2.5.
    try:
        limit = int(max_words_per_line)
    except (TypeError, ValueError, OverflowError):
        limit = 0

    entries = []
    for segment in segments:
        # A segment without word timestamps contributes no entries.
        words = list(segment.words or [])
        step = limit if limit > 0 else max(len(words), 1)
        for offset in range(0, len(words), step):
            chunk = words[offset:offset + step]
            start_time = convert_seconds_to_time(chunk[0].start)
            end_time = convert_seconds_to_time(chunk[-1].end)
            line_text = ' '.join(word.word.strip() for word in chunk)
            entries.append(
                f"{len(entries) + 1}\n{start_time} --> {end_time}\n{line_text}\n\n"
            )

    result = ''.join(entries)
    # Open the file only after the segments are fully consumed, so a
    # failure while iterating them does not leave an empty SRT file behind.
    with open(srt_path, "w", encoding='utf-8') as file:
        file.write(result)
    return result, srt_path
|
43 |
+
|
44 |
+
def transcriber(video_input: str,
                max_words_per_line: int,
                task: str):
    """Transcribe or translate the speech of a video into an SRT file.

    The video's audio is extracted with ``convert_video_to_audio``, run
    through a faster-whisper ``large-v3`` model on CPU (int8) with VAD
    filtering and word timestamps, and written out with ``write_srt``.

    Args:
        video_input: Path of the video file.
        max_words_per_line: Maximum number of words per subtitle entry,
            forwarded to ``write_srt``.
        task: Task name forwarded to ``WhisperModel.transcribe``; the UI
            offers ``"transcribe"`` and ``"translate"``.

    Returns:
        The ``(srt_text, srt_path)`` tuple returned by ``write_srt``. The
        SRT file is the video's path with its final extension replaced
        by ``.srt``.
    """
    # splitext removes only the last extension, so dots in directory names
    # or in the file stem do not truncate the output path.
    stem, _ext = os.path.splitext(video_input)
    srt_filepath = os.path.normpath(f"{stem}.srt")

    audio_input = convert_video_to_audio(video_input)

    model_size = "large-v3"
    model = WhisperModel(model_size, device="cpu", compute_type="int8")
    segments, _info = model.transcribe(
        audio_input,
        beam_size=5,
        task=task,
        vad_filter=True,
        vad_parameters=dict(min_silence_duration_ms=500),
        # write_srt groups individual words, so per-word timings are required.
        word_timestamps=True,
    )
    return write_srt(segments=segments, max_words_per_line=max_words_per_line, srt_path=srt_filepath)
|