File size: 2,880 Bytes
abaf86c
 
20baefb
9d22070
d3e2fa4
 
abaf86c
7069d5c
9417b92
abaf86c
 
7069d5c
f49454a
b23d0f5
7069d5c
20baefb
 
 
 
 
 
 
 
d3e2fa4
 
 
 
 
 
 
 
 
20baefb
 
 
d3e2fa4
20baefb
d3e2fa4
da208e4
5eb56a0
 
20baefb
b152137
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20baefb
 
 
 
9074304
20baefb
b152137
 
 
 
 
 
 
 
 
 
20baefb
 
 
 
 
b152137
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
from transformers import pipeline
import gradio as gr
from pytube import YouTube
from datasets import Dataset, Audio
import os
from moviepy.editor import AudioFileClip

# Speech recognition model (the fine-tuned Whisper small checkpoint for Swedish).
# Replace the id with "your-username/the-name-you-picked" to load another checkpoint.
pipe1 = pipeline(model="khalidey/ID2223_Lab2_Whisper_SV")

# Swedish GPT model used to generate text from the recognized transcript.
pipe2 = pipeline(task="text-generation", model="birgermoell/swedish-gpt")

def transcribe(audio):
    """Transcribe an audio file and continue the transcript with Swedish GPT.

    Args:
        audio: Path to the audio file (the ``gr.Audio`` inputs wired to this
            function use ``type="filepath"``). A falsy value (``None`` or
            ``""``) is treated as "no audio was provided".

    Returns:
        A ``(text, generated_text)`` tuple: the recognized speech and the
        text produced by the generation model from it. Both are empty
        strings when no audio was provided.
    """
    # Nothing uploaded/recorded: return empty outputs instead of letting the
    # speech pipeline fail on a missing input.
    if not audio:
        return "", ""

    text = pipe1(audio)["text"]

    # max_new_tokens bounds only the generated continuation; max_length would
    # also count the prompt, leaving no room to generate after a long
    # transcript. Only the first sequence is returned, so request just one.
    outputs = pipe2(text, max_new_tokens=50, num_return_sequences=1)
    generated_text = outputs[0]["generated_text"]
    return text, generated_text

def youtube_link(url):
    """Download the audio track of a YouTube video.

    Args:
        url: Address of the YouTube video.

    Returns:
        Path of the downloaded audio-only mp4 file, as returned by pytube's
        ``download()``.

    Raises:
        ValueError: If the video offers no audio-only mp4 stream.
    """
    streams = YouTube(url).streams.filter(only_audio=True, file_extension='mp4')

    # first() yields None when the filter matched nothing; fail with a clear
    # message rather than an AttributeError on None.download().
    stream = streams.first()
    if stream is None:
        raise ValueError(f"No audio-only mp4 stream found for {url!r}")

    return stream.download()

def convert_to_wav(path):
    """Re-encode an audio file as ``segment.wav``.

    Args:
        path: Path of the source audio file (e.g. the downloaded mp4).

    Returns:
        The path of the written WAV file, always ``"segment.wav"`` (relative
        to the current working directory), so each call overwrites the
        previous result.
    """
    output_path = "segment.wav"

    sound = AudioFileClip(path)
    try:
        # Write the clip in full. Cutting it at int(duration) dropped the
        # trailing fraction of a second and reduced sub-second audio to a
        # zero-length clip.
        sound.write_audiofile(output_path)
    finally:
        # Release the underlying reader even if encoding fails.
        sound.close()

    return output_path

def youtube_transcribe(url):
    """Transcribe the speech in a YouTube video.

    Downloads the video's audio track, converts it to WAV, loads it through
    a one-row ``datasets`` table cast to 16 kHz audio and runs the speech
    recognition pipeline on it.

    Args:
        url: Address of the YouTube video. ``None``, an empty string or a
            whitespace-only string is treated as "no URL given".

    Returns:
        The recognized text, or an empty string when no URL was given.
    """
    # Empty textbox: nothing to download.
    if not url or not url.strip():
        return ""

    path = youtube_link(url.strip())
    path_wav = convert_to_wav(path)

    # The column must hold the path as ONE entry. list(path_wav) would split
    # the string into single characters and create one bogus row per letter.
    audio_dataset = Dataset.from_dict({"audio": [path_wav]}).cast_column(
        "audio", Audio(sampling_rate=16000)
    )
    text = pipe1(audio_dataset[0]["audio"])["text"]

    return text
    
with gr.Blocks() as demo:
    # Page header.
    gr.Markdown("Whisper Small Swedish + Swedish GPT")
    gr.Markdown("Realtime demo for Swedish speech recognition using a fine-tuned Whisper small model & text generation with Swedish GPT.")

    # Tab 1: transcribe an audio file uploaded from disk.
    with gr.TabItem("Upload from disk"):
        upload_file = gr.Audio(source="upload", type="filepath", label="Upload from disk")
        upload_button = gr.Button("Submit for recognition")
        upload_transcript = gr.Textbox(label="Recognized speech from uploaded file")
        upload_generated = gr.Textbox(label="Swedish-gpt generated speech from uploaded file")
        upload_outputs = [upload_transcript, upload_generated]

    # Tab 2: transcribe a microphone recording.
    with gr.TabItem("Record from microphone"):
        record_file = gr.Audio(source="microphone", type="filepath", label="Record from microphone")
        record_button = gr.Button("Submit for recognition")
        record_transcript = gr.Textbox(label="Recognized speech from recordings")
        record_generated = gr.Textbox(label="Swedish-gpt generated speech from recordings")
        record_outputs = [record_transcript, record_generated]

    # Tab 3: transcribe the audio track of a YouTube video.
    with gr.TabItem("Transcribe from Youtube URL"):
        url = gr.Text(max_lines=1, label="Transcribe from YouTube URL")
        youtube_button = gr.Button("Submit for recognition")
        youtube_transcript = gr.Textbox(label="Recognized speech from URL")
        youtube_outputs = [youtube_transcript]

    # Wire each submit button to its handler (registered in tab order).
    upload_button.click(fn=transcribe, inputs=upload_file, outputs=upload_outputs)
    record_button.click(fn=transcribe, inputs=record_file, outputs=record_outputs)
    youtube_button.click(fn=youtube_transcribe, inputs=url, outputs=youtube_outputs)

demo.launch()