File size: 4,877 Bytes
d6a25cd
023302c
 
 
7fe7ac2
023302c
d6a25cd
c731f97
d6a25cd
023302c
 
 
 
 
 
 
 
d6a25cd
 
 
023302c
 
 
 
7fe7ac2
 
 
 
 
 
 
 
 
 
 
023302c
 
 
 
 
05f807a
 
023302c
d7d6d65
023302c
d7d6d65
c731f97
023302c
 
 
 
 
 
d7d6d65
 
 
 
 
 
 
 
 
023302c
 
 
 
d7d6d65
 
 
 
 
 
 
 
 
023302c
 
 
d7d6d65
 
 
 
 
 
 
 
 
 
 
023302c
 
 
7fe7ac2
d7d6d65
 
 
 
 
 
 
 
 
 
 
7fe7ac2
 
 
 
d7d6d65
 
 
 
 
 
 
 
 
 
 
023302c
 
 
7fe7ac2
023302c
 
 
 
 
 
 
 
 
 
7fe7ac2
023302c
d6a25cd
023302c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import gradio as gr
from pytube import YouTube
from transformers import pipeline
import os
from moviepy.editor import VideoFileClip


# Speech-recognition pipeline backed by the fine-tuned Whisper Small Italian
# checkpoint; created once at import time and used by the transcribe_* handlers.
pipe = pipeline(
    task="automatic-speech-recognition",
    model="GIanlucaRub/whisper-small-it-3",
)

def transcribe_yt(link):
    """Transcribe the audio track of a YouTube video.

    Args:
        link: URL of the YouTube video. ``None`` or a blank string means
            there is nothing to transcribe.

    Returns:
        The ``"text"`` field produced by the module-level ``pipe``, or an
        empty string when no link was supplied.

    Raises:
        ValueError: If the video offers no audio-only stream.
    """
    if not link or not link.strip():
        return ""

    import tempfile

    yt = YouTube(link)
    stream = yt.streams.filter(only_audio=True).first()
    if stream is None:
        raise ValueError(f"No audio-only stream available for {link!r}")

    # Download into a private temporary directory instead of a fixed
    # "audio.mp3" in the working directory: simultaneous requests no longer
    # overwrite each other's file, and the directory (with the download) is
    # removed even when the download or the transcription raises.
    with tempfile.TemporaryDirectory() as workdir:
        audio = stream.download(output_path=workdir, filename="audio.mp3")
        return pipe(audio)["text"]

def transcribe_audio(audio):
    """Transcribe an audio file with the module-level ``pipe``.

    Args:
        audio: Path of the audio file to transcribe. ``None`` or an empty
            string means no audio was provided.

    Returns:
        The ``"text"`` field of the pipeline result, or an empty string when
        no audio was provided.
    """
    # Without this guard a click with nothing recorded/uploaded would hand
    # None to the pipeline and surface as an unhandled error.
    if not audio:
        return ""
    return pipe(audio)["text"]

def populate_metadata(link):
    """Look up the thumbnail URL and title of a YouTube video.

    This handler is bound to the link textbox's ``change`` event, so it is
    also invoked with blank and partially entered links; those yield empty
    metadata instead of an exception.

    Args:
        link: Current content of the YouTube link textbox.

    Returns:
        A ``(thumbnail_url, title)`` tuple, or ``(None, None)`` when the link
        is blank or pytube cannot resolve it.
    """
    if not link or not link.strip():
        return None, None

    # Function-scope import: only needed once there is a link to resolve.
    from pytube.exceptions import PytubeError

    try:
        yt = YouTube(link)
        return yt.thumbnail_url, yt.title
    except PytubeError:
        # Malformed or unavailable video: show nothing rather than failing
        # the event on every intermediate edit of the textbox.
        return None, None

def transcribe_video(video):
    """Extract the audio track of a video file and transcribe it.

    The audio is written next to the video as ``<video stem>.mp3``. Both the
    input video and the extracted audio file are deleted before returning,
    whether or not transcription succeeds.

    Args:
        video: Path of the video file. ``None`` or an empty string means no
            video was provided.

    Returns:
        The transcription returned by ``transcribe_audio``, or an empty
        string when no video was provided.

    Raises:
        ValueError: If the video contains no audio track.
    """
    if not video:
        return ""

    # splitext copes with extensions of any length (".webm", ".mpeg", none),
    # unlike slicing off the last four characters.
    audio = os.path.splitext(video)[0] + ".mp3"
    try:
        clip = VideoFileClip(video)
        try:
            if clip.audio is None:
                raise ValueError("The video has no audio track to transcribe")
            clip.audio.write_audiofile(audio)
        finally:
            # Release the reader even if the export fails.
            clip.close()
        return transcribe_audio(audio)
    finally:
        # Clean up both files on every path; the audio file may not exist
        # yet if extraction failed, which is fine.
        for path in (video, audio):
            try:
                os.remove(path)
            except FileNotFoundError:
                pass

# Gradio UI: one shared transcription textbox plus four input sections
# (microphone, uploaded audio, uploaded video, YouTube link). Components are
# declared in the order they are meant to appear, so statement order matters.
block = gr.Blocks()

with block:
    # Page header: title and short description.
    gr.HTML(
        """

            <div style="text-align: center; max-width: 500px; margin: 0 auto;margin-top: 10px">
              <div>
                <h1 style="font-size: 400%;line-height: 1.2;">Whisper Italian Automatic Speech Recognition</h1>
              </div>
              <p style="margin-bottom: 10px; font-size: 150%;margin-top: 30px;line-height: 1.2;">
                Realtime demo for Italian speech recognition using a fine-tuned Whisper Small model.You can use the model in 4 different ways.
              </p>
            </div>
        """
    )
    with gr.Group():
        with gr.Box():
          gr.HTML(
        """

            <div style="text-align: center; max-width: 500px; margin: 0 auto;margin-top: 10px">
              <p style="margin-bottom: 10px; font-size: 100%;margin-top: 10px;line-height: 1.2;">
                  Here you can see the transcription.
              </p>
            </div>
        """)
          # Output textbox shared by all four transcribe buttons (see Events).
          text = gr.Textbox(
              label="Transcription", 
              placeholder="Transcription Output",
              lines=5)
          # --- Input 1: microphone recording ---
          gr.HTML(
        """

            <div style="text-align: center; max-width: 500px; margin: 0 auto;margin-top: 10px">
              <p style="margin-bottom: 10px; font-size: 100%;margin-top: 20px;line-height: 1.0;">
                  You can record audio from your microphone.
              </p>
            </div>
        """)  
          # type="filepath": the handler receives a path, not raw samples.
          microphone=gr.Audio(source="microphone", type="filepath")
          with gr.Row().style(mobile_collapse=False, equal_height=True): 
              btn_microphone = gr.Button("Transcribe microphone audio")
          
        
          # --- Input 2: uploaded audio file ---
          gr.HTML(
        """

            <div style="text-align: center; max-width: 500px; margin: 0 auto;margin-top: 10px">
              <p style="margin-bottom: 10px; font-size: 100%;margin-top: 20px;line-height: 1.2;">
                  You can upload an audio file.
              </p>
            </div>
        """)  
          audio_uploaded=gr.Audio(source="upload", type="filepath")
          with gr.Row().style(mobile_collapse=False, equal_height=True): 
              btn_audio_uploaded = gr.Button("Transcribe audio uploaded")
          
        
        
          # --- Input 3: uploaded video file ---
          gr.HTML(
        """

            <div style="text-align: center; max-width: 500px; margin: 0 auto;margin-top: 10px">
              <p style="margin-bottom: 10px; font-size: 100%;margin-top: 20px;line-height: 1.2;">
                  You can upload a video file
              </p>
            </div>
        """) 
          video_uploaded = gr.Video(source = "upload") 
          with gr.Row().style(mobile_collapse=False, equal_height=True): 
              btn_video_uploaded = gr.Button("Transcribe video uploaded")
            
          
        
          # --- Input 4: YouTube link ---
          gr.HTML(
        """

            <div style="text-align: center; max-width: 500px; margin: 0 auto;margin-top: 10px">
              <p style="margin-bottom: 10px; font-size: 100%;margin-top: 20px;line-height: 1.2;">
                  You can put a youtube video link
              </p>
            </div>
        """) 
          link = gr.Textbox(label="YouTube Link")
          with gr.Row().style(mobile_collapse=False, equal_height=True): 
              btn_youtube = gr.Button("Transcribe Youtube video") 
    
          # Title and thumbnail of the linked video, filled by populate_metadata.
          with gr.Row().style(mobile_collapse=False, equal_height=True):
            title = gr.Label(label="Video Title", placeholder="Title")
            img = gr.Image(label="Thumbnail")
          
                
          
          # Events
          # Each button sends its own input to a handler and writes the
          # result into the shared `text` box.
          btn_youtube.click(transcribe_yt, inputs=[link], outputs=[text])
          btn_microphone.click(transcribe_audio, inputs=[microphone], outputs=[text])
          btn_audio_uploaded.click(transcribe_audio, inputs=[audio_uploaded], outputs=[text])
          btn_video_uploaded.click(transcribe_video, inputs=[video_uploaded], outputs=[text])
          # Refresh thumbnail and title whenever the link textbox changes;
          # output order matches populate_metadata's (thumbnail, title) return.
          link.change(populate_metadata, inputs=[link], outputs=[img, title])

# Start the app at import/run time with debug=True.
block.launch(debug=True)