# Gradio app: upload an audio file and get a timestamped transcript
# produced by faster-whisper.
import os

import gradio as gr
import pandas as pd
import torch
from faster_whisper import WhisperModel
model_size = "large-v2"

# Pick the compute device once at startup; the rest of the app only cares
# about the loaded model.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

if device == "cuda:0":
    # GPU: FP16 is fast and accurate enough for inference.
    # (compute_type="int8_float16" would cut memory further if needed.)
    _whisper_kwargs = {"device": "cuda", "compute_type": "float16"}
else:
    # CPU: INT8 quantization keeps memory and latency manageable.
    _whisper_kwargs = {"device": "cpu", "compute_type": "int8"}

model_whisper = WhisperModel(model_size, **_whisper_kwargs)
def get_filename(file_obj):
    """Return the base name of an uploaded file, with path components stripped.

    Uses os.path.basename instead of splitting on "/" so that both POSIX
    and Windows-style paths are handled correctly.
    """
    return os.path.basename(file_obj.name)
def audio_to_transcript(file_obj):
    """Transcribe an uploaded audio file.

    Parameters:
        file_obj: uploaded-file object exposing a ``.name`` path attribute
            (as provided by ``gr.File``).

    Returns:
        tuple: (base filename, DataFrame with columns "start", "end", "text",
        one row per detected speech segment).
    """
    # vad_filter drops non-speech stretches; beam_size=5 trades a little
    # speed for better decoding accuracy.
    segments, _ = model_whisper.transcribe(
        file_obj.name, beam_size=5, vad_filter=True
    )
    # `segments` is a lazy iterator — the transcription work happens here.
    rows = [(seg.start, seg.end, seg.text) for seg in segments]
    df = pd.DataFrame(rows, columns=["start", "end", "text"])
    return get_filename(file_obj), df
## Gradio interface
headers = ["start", "end", "text"]

# Two outputs: the uploaded file's name and the segment table.
_outputs = [
    gr.Textbox(label="Name of the audio file"),
    gr.DataFrame(label="Transcript", headers=headers),
]

iface = gr.Interface(
    fn=audio_to_transcript,
    inputs=gr.File(label="Audio file"),
    outputs=_outputs,
    allow_flagging="never",
    title="Audio to Transcript",
    description="Just paste any audio file and get its corresponding transcript with timeline.",
)

iface.launch()