import os
import gradio as gr
import torch
from faster_whisper import WhisperModel
import pandas as pd

model_size = "large-v2"
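# "large-v2" favors accuracy; smaller checkpoints such as "medium" or "small"
# can be swapped in to reduce memory use and speed up transcription.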

# get device
device = "cuda:0" if torch.cuda.is_available() else "cpu"

if device == "cuda:0":
    # Run on GPU with FP16
    model_whisper = WhisperModel(model_size, device="cuda", compute_type="float16")
    # or Run on GPU with INT8
    # model = WhisperModel(model_size, device="cuda", compute_type="int8_float16")
else:
    # Run on CPU with INT8
    model_whisper = WhisperModel(model_size, device="cpu", compute_type="int8")

def get_filename(file_obj):
    return os.path.basename(file_obj.name)

def audio_to_transcript(file_obj):
    # get all audio segments
    try:
        # file_obj is a tempfile-like object exposing its path via .name
        filename = get_filename(file_obj)
        segments, _ = model_whisper.transcribe(file_obj.name, beam_size=5, vad_filter=True)
    except AttributeError:
        # file_obj is already a plain path string
        filename = os.path.basename(file_obj)
        segments, _ = model_whisper.transcribe(file_obj, beam_size=5, vad_filter=True)
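    # Note: faster-whisper returns segments lazily (a generator), so the actual
    # transcription work happens while iterating over it below.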
 
    start_segments, end_segments, text_segments = list(), list(), list()
    for segment in segments:
        start, end, text = segment.start, segment.end, segment.text
        start_segments.append(start)
        end_segments.append(end)
        text_segments.append(text)

    # save transcript into csv
    df = pd.DataFrame()
    df["start"] = start_segments
    df["end"] = end_segments
    df["text"] = text_segments

    csv_file = os.path.splitext(filename)[0] + ".csv"
    df.to_csv(csv_file, encoding="utf-8", index=False)

    # return the csv path directly; the gr.File output component renders it as a download link
    return filename, csv_file, df

## Gradio interface
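# The three output components below correspond, in order, to the three values
# returned by audio_to_transcript: file name, csv path, transcript DataFrame.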
headers = ["start", "end", "text"]
iface = gr.Interface(fn=audio_to_transcript,
                     inputs=gr.File(label="Audio file"),
                     outputs=[
                        gr.Textbox(label="Audio file name"),
                        gr.File(label="Transcript csv file"),
                        gr.DataFrame(label="Transcript", headers=headers),
                     ],
                     allow_flagging="never",
                     title="Audio to Transcript",
                     description="Just paste any audio file and get its corresponding transcript with timeline.",
                     )
iface.launch()
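
# Minimal usage sketch without the web UI (assumes a local audio file exists;
# "sample.wav" is a placeholder path, not part of the original app):
#
#   name, csv_path, transcript_df = audio_to_transcript("sample.wav")
#   print(transcript_df.head())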