File size: 1,210 Bytes
ffde417
 
cad9f2f
 
ffde417
cad9f2f
ffde417
cad9f2f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ffde417
 
 
 
cad9f2f
 
 
 
ffde417
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import gradio as gr
from datasets import Audio, Dataset
from pytube import YouTube
from transformers import pipeline

# Inference pipeline, built once at import time and reused by every
# transcribe() call.
# NOTE(review): "Neprox/model" is presumably the fine-tuned Whisper-small
# Swedish checkpoint named in the interface description below — confirm.
pipe = pipeline(model="Neprox/model")

def transcribe(audio, url):
    """Transcribe speech from a YouTube link or from a recorded audio file.

    A non-blank ``url`` takes precedence over ``audio``. In that case the
    video's mp4 audio stream is downloaded to a temporary directory,
    transcribed, and removed again. Otherwise the file at ``audio`` is
    transcribed directly.

    Args:
        audio: Path to an audio file (the microphone component is configured
            with ``type="filepath"``), or ``None`` when nothing was recorded.
        url: YouTube video URL, or an empty string / ``None``.

    Returns:
        The transcribed text as a plain string.

    Raises:
        ValueError: If neither a recording nor a URL was supplied, or if the
            video exposes no mp4 audio-only stream.
    """
    # Function-scope import keeps this block self-contained.
    import tempfile

    url = (url or "").strip()

    if not url:
        if not audio:
            raise ValueError(
                "Provide a microphone recording or a YouTube link to transcribe."
            )
        return pipe(audio)["text"]

    # Download YouTube video (audio-only mp4 stream).
    streams = YouTube(url).streams.filter(only_audio=True, file_extension='mp4')
    stream = streams.first()
    if stream is None:
        raise ValueError("No mp4 audio stream is available for this YouTube link.")

    # TODO:
    # Process up to 10 minutes by segmenting into 30 second blocks
    # Use pyMovie for selecting time ranges
    # query every block individually
    # Annotate text with timestamps

    # The temporary directory guarantees the downloaded file is deleted even
    # if transcription fails, so repeated requests do not fill the disk.
    with tempfile.TemporaryDirectory() as download_dir:
        audio_fpath = stream.download(output_path=download_dir)
        # The pipeline is handed the file path, exactly as in the microphone
        # branch, and only the "text" field is returned so that both branches
        # yield a plain string for the "text" output component.
        return pipe(audio_fpath)["text"]

# Gradio UI. The two input components are passed positionally to
# transcribe(audio, url), so their order here must match its signature.
iface = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(source="microphone", type="filepath"),
        gr.Text(max_lines=1, placeholder="Enter YouTube Link with Swedish speech to be transcribed"),
    ],
    outputs="text",
    title="Whisper Small Swedish",
    description="Realtime demo for Swedish speech recognition using a fine-tuned Whisper small model.",
)

iface.launch()