storresbusquets committed on
Commit
4c634f2
·
1 Parent(s): b397f13

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +177 -45
app.py CHANGED
@@ -1,27 +1,57 @@
 
1
  import gradio as gr
2
  import whisper
3
  from pytube import YouTube
4
- from transformers import pipeline, T5Tokenizer, T5ForConditionalGeneration, AutoTokenizer, AutoModelForSeq2SeqLM
 
5
 
6
- class GradioInference():
7
  def __init__(self):
 
 
8
  self.sizes = list(whisper._MODELS.keys())
 
 
9
  self.langs = ["none"] + sorted(list(whisper.tokenizer.LANGUAGES.values()))
 
 
10
  self.current_size = "base"
 
 
11
  self.loaded_model = whisper.load_model(self.current_size)
 
 
12
  self.yt = None
 
 
13
  self.summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
14
-
15
  # Initialize VoiceLabT5 model and tokenizer
16
- self.keyword_model = T5ForConditionalGeneration.from_pretrained("Voicelab/vlt5-base-keywords")
17
- self.keyword_tokenizer = T5Tokenizer.from_pretrained("Voicelab/vlt5-base-keywords")
 
 
 
 
18
 
19
  # Sentiment Classifier
20
  self.classifier = pipeline("text-classification")
21
 
22
  def __call__(self, link, lang, size):
 
 
 
 
 
 
 
 
 
 
23
  if self.yt is None:
24
  self.yt = YouTube(link)
 
 
25
  path = self.yt.streams.filter(only_audio=True)[0].download(filename="tmp.mp4")
26
 
27
  if lang == "none":
@@ -30,33 +60,66 @@ class GradioInference():
30
  if size != self.current_size:
31
  self.loaded_model = whisper.load_model(size)
32
  self.current_size = size
33
-
 
34
  results = self.loaded_model.transcribe(path, language=lang)
35
-
36
  # Perform summarization on the transcription
37
- transcription_summary = self.summarizer(results["text"], max_length=130, min_length=30, do_sample=False)
 
 
38
 
39
  # Extract keywords using VoiceLabT5
40
  task_prefix = "Keywords: "
41
  input_sequence = task_prefix + results["text"]
42
- input_ids = self.keyword_tokenizer(input_sequence, return_tensors="pt", truncation=False).input_ids
43
- output = self.keyword_model.generate(input_ids, no_repeat_ngram_size=3, num_beams=4)
 
 
 
 
44
  predicted = self.keyword_tokenizer.decode(output[0], skip_special_tokens=True)
45
- keywords = [x.strip() for x in predicted.split(',') if x.strip()]
46
 
 
47
  label = self.classifier(results["text"])[0]["label"]
48
 
49
- return results["text"], transcription_summary[0]["summary_text"], keywords, label
 
 
 
 
 
 
 
 
 
 
 
 
50
 
51
  def populate_metadata(self, link):
 
 
 
 
 
52
  self.yt = YouTube(link)
53
  return self.yt.thumbnail_url, self.yt.title
54
 
55
-
56
  def from_audio_input(self, lang, size, audio_file):
 
 
 
 
 
 
 
 
 
57
  if lang == "none":
58
  lang = None
59
-
60
  if size != self.current_size:
61
  self.loaded_model = whisper.load_model(size)
62
  self.current_size = size
@@ -64,19 +127,38 @@ class GradioInference():
64
  results = self.loaded_model.transcribe(audio_file, language=lang)
65
 
66
  # Perform summarization on the transcription
67
- transcription_summary = self.summarizer(results["text"], max_length=130, min_length=30, do_sample=False)
 
 
68
 
69
  # Extract keywords using VoiceLabT5
70
  task_prefix = "Keywords: "
71
  input_sequence = task_prefix + results["text"]
72
- input_ids = self.keyword_tokenizer(input_sequence, return_tensors="pt", truncation=False).input_ids
73
- output = self.keyword_model.generate(input_ids, no_repeat_ngram_size=3, num_beams=4)
 
 
 
 
74
  predicted = self.keyword_tokenizer.decode(output[0], skip_special_tokens=True)
75
- keywords = [x.strip() for x in predicted.split(',') if x.strip()]
76
 
 
77
  label = self.classifier(results["text"])[0]["label"]
78
 
79
- return results["text"], transcription_summary[0]["summary_text"], keywords, label
 
 
 
 
 
 
 
 
 
 
 
 
80
 
81
 
82
  gio = GradioInference()
@@ -101,56 +183,106 @@ with block as demo:
101
  with gr.Tab("From YouTube"):
102
  with gr.Box():
103
  with gr.Row().style(equal_height=True):
104
- size = gr.Dropdown(label="Model Size", choices=gio.sizes, value='base')
105
- lang = gr.Dropdown(label="Language (Optional)", choices=gio.langs, value="none")
106
- link = gr.Textbox(label="YouTube Link", placeholder="Enter YouTube link...")
 
 
 
 
 
 
107
  title = gr.Label(label="Video Title")
108
  with gr.Row().style(equal_height=True):
109
  img = gr.Image(label="Thumbnail")
110
- text = gr.Textbox(label="Transcription", placeholder="Transcription Output...", lines=10).style(show_copy_button=True, container=True)
 
 
 
 
111
  with gr.Row().style(equal_height=True):
112
- summary = gr.Textbox(label="Summary", placeholder="Summary Output...", lines=5).style(show_copy_button=True, container=True)
113
- keywords = gr.Textbox(label="Keywords", placeholder="Keywords Output...", lines=5).style(show_copy_button=True, container=True)
 
 
 
 
114
  label = gr.Label(label="Sentiment Analysis")
 
115
  with gr.Row().style(equal_height=True):
116
- clear = gr.ClearButton([link, title, img, text, summary, keywords, label], scale=1)
117
- btn = gr.Button("Get video insights", variant='primary', scale=1)
118
- btn.click(gio, inputs=[link, lang, size], outputs=[text, summary, keywords, label])
119
- link.change(gio.populate_metadata, inputs=[link], outputs=[img, title])
 
 
 
 
 
 
 
120
 
121
  with gr.Tab("From Audio file"):
122
  with gr.Box():
123
  with gr.Row().style(equal_height=True):
124
- size = gr.Dropdown(label="Model Size", choices=gio.sizes, value='base')
125
- lang = gr.Dropdown(label="Language (Optional)", choices=gio.langs, value="none")
 
 
 
 
126
  audio_file = gr.Audio(type="filepath")
127
  with gr.Row().style(equal_height=True):
128
- text = gr.Textbox(label="Transcription", placeholder="Transcription Output...", lines=10).style(show_copy_button=True, container=False)
 
 
 
 
129
  with gr.Row().style(equal_height=True):
130
- summary = gr.Textbox(label="Summary", placeholder="Summary Output", lines=5)
131
- keywords = gr.Textbox(label="Keywords", placeholder="Keywords Output", lines=5)
 
 
 
 
132
  label = gr.Label(label="Sentiment Analysis")
 
133
  with gr.Row().style(equal_height=True):
134
- clear = gr.ClearButton([text], scale=1)
135
- btn = gr.Button("Get video insights", variant='primary', scale=1) # Updated button label
136
- btn.click(gio.from_audio_input, inputs=[lang, size, audio_file], outputs=[text, summary, keywords, label])
137
-
 
 
 
 
 
 
138
 
139
  with block:
 
 
 
140
  gr.Markdown("About the app:")
141
-
142
  with gr.Accordion("What is YouTube Insights?", open=False):
143
- gr.Markdown("YouTube Insights is a tool developed with academic purposes only, that creates summaries, keywords and sentiments analysis based on YouTube videos or user audio files.")
144
-
 
 
145
  with gr.Accordion("How does it work?", open=False):
146
- gr.Markdown("Works by using OpenAI's Whisper, DistilBART for summarization and VoiceLabT5 for Keyword Extraction.")
 
 
147
 
148
- gr.HTML("""
 
149
  <div style="text-align: center; max-width: 500px; margin: 0 auto;">
150
  <p style="margin-bottom: 10px; font-size: 96%">
151
  2023 Master in Big Data & Data Science - Universidad Complutense de Madrid
152
  </p>
153
  </div>
154
- """)
 
155
 
156
- demo.launch()
 
1
+ # Imports
2
  import gradio as gr
3
  import whisper
4
  from pytube import YouTube
5
+ from transformers import pipeline, T5Tokenizer, T5ForConditionalGeneration
6
+ from wordcloud import WordCloud
7
 
8
+ class GradioInference:
9
  def __init__(self):
10
+
11
+ # OpenAI's Whisper model sizes
12
  self.sizes = list(whisper._MODELS.keys())
13
+
14
+ # Whisper's available languages for ASR
15
  self.langs = ["none"] + sorted(list(whisper.tokenizer.LANGUAGES.values()))
16
+
17
+ # Default size
18
  self.current_size = "base"
19
+
20
+ # Load the Whisper model at the default size
21
  self.loaded_model = whisper.load_model(self.current_size)
22
+
23
+ # Initialize Pytube Object
24
  self.yt = None
25
+
26
+ # Initialize summary model
27
  self.summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
28
+
29
  # Initialize VoiceLabT5 model and tokenizer
30
+ self.keyword_model = T5ForConditionalGeneration.from_pretrained(
31
+ "Voicelab/vlt5-base-keywords"
32
+ )
33
+ self.keyword_tokenizer = T5Tokenizer.from_pretrained(
34
+ "Voicelab/vlt5-base-keywords"
35
+ )
36
 
37
  # Sentiment Classifier
38
  self.classifier = pipeline("text-classification")
39
 
40
  def __call__(self, link, lang, size):
41
+ """
42
+ Call the Gradio Inference python class.
43
+ This class gets access to a YouTube video using the Pytube library and downloads its audio.
44
+ Then it uses the Whisper model to perform Automatic Speech Recognition (i.e Speech-to-Text).
45
+ Once the function has the transcription of the video it processes it to obtain:
46
+ - Summary: using Facebook's BART transformer.
47
+ - KeyWords: using VoiceLabT5 keyword extractor.
48
+ - Sentiment Analysis: using Hugging Face's default sentiment classifier
49
+ - WordCloud: using the wordcloud python library.
50
+ """
51
  if self.yt is None:
52
  self.yt = YouTube(link)
53
+
54
+ # Pytube library to access to YouTube audio stream
55
  path = self.yt.streams.filter(only_audio=True)[0].download(filename="tmp.mp4")
56
 
57
  if lang == "none":
 
60
  if size != self.current_size:
61
  self.loaded_model = whisper.load_model(size)
62
  self.current_size = size
63
+
64
+ # Transcribe the audio extracted from pytube
65
  results = self.loaded_model.transcribe(path, language=lang)
66
+
67
  # Perform summarization on the transcription
68
+ transcription_summary = self.summarizer(
69
+ results["text"], max_length=512, min_length=30, do_sample=False
70
+ )
71
 
72
  # Extract keywords using VoiceLabT5
73
  task_prefix = "Keywords: "
74
  input_sequence = task_prefix + results["text"]
75
+ input_ids = self.keyword_tokenizer(
76
+ input_sequence, return_tensors="pt", truncation=False
77
+ ).input_ids
78
+ output = self.keyword_model.generate(
79
+ input_ids, no_repeat_ngram_size=3, num_beams=4
80
+ )
81
  predicted = self.keyword_tokenizer.decode(output[0], skip_special_tokens=True)
82
+ keywords = [x.strip() for x in predicted.split(",") if x.strip()]
83
 
84
+ # Sentiment label
85
  label = self.classifier(results["text"])[0]["label"]
86
 
87
+ # Generate WordCloud object
88
+ wordcloud = WordCloud().generate(results["text"])
89
+
90
+ # WordCloud image to display
91
+ wordcloud_image = wordcloud.to_image()
92
+
93
+ return (
94
+ results["text"],
95
+ transcription_summary[0]["summary_text"],
96
+ keywords,
97
+ label,
98
+ wordcloud_image,
99
+ )
100
 
101
  def populate_metadata(self, link):
102
+ """
103
+ Access to the YouTube video title and thumbnail image to further display it
104
+ params:
105
+ - link: a YouTube URL.
106
+ """
107
  self.yt = YouTube(link)
108
  return self.yt.thumbnail_url, self.yt.title
109
 
 
110
  def from_audio_input(self, lang, size, audio_file):
111
+ """
112
+ Call the Gradio Inference python class.
113
+ Uses the Whisper model directly to perform Automatic Speech Recognition (i.e. Speech-to-Text).
114
+ Once the function has the transcription of the audio it processes it to obtain:
115
+ - Summary: using Facebook's BART transformer.
116
+ - KeyWords: using VoiceLabT5 keyword extractor.
117
+ - Sentiment Analysis: using Hugging Face's default sentiment classifier
118
+ - WordCloud: using the wordcloud python library.
119
+ """
120
  if lang == "none":
121
  lang = None
122
+
123
  if size != self.current_size:
124
  self.loaded_model = whisper.load_model(size)
125
  self.current_size = size
 
127
  results = self.loaded_model.transcribe(audio_file, language=lang)
128
 
129
  # Perform summarization on the transcription
130
+ transcription_summary = self.summarizer(
131
+ results["text"], max_length=512, min_length=30, do_sample=False
132
+ )
133
 
134
  # Extract keywords using VoiceLabT5
135
  task_prefix = "Keywords: "
136
  input_sequence = task_prefix + results["text"]
137
+ input_ids = self.keyword_tokenizer(
138
+ input_sequence, return_tensors="pt", truncation=False
139
+ ).input_ids
140
+ output = self.keyword_model.generate(
141
+ input_ids, no_repeat_ngram_size=3, num_beams=4
142
+ )
143
  predicted = self.keyword_tokenizer.decode(output[0], skip_special_tokens=True)
144
+ keywords = [x.strip() for x in predicted.split(",") if x.strip()]
145
 
146
+ # Sentiment label
147
  label = self.classifier(results["text"])[0]["label"]
148
 
149
+ # WordCloud object
150
+ wordcloud = WordCloud().generate(
151
+ results["text"]
152
+ )
153
+ wordcloud_image = wordcloud.to_image()
154
+
155
+ return (
156
+ results["text"],
157
+ transcription_summary[0]["summary_text"],
158
+ keywords,
159
+ label,
160
+ wordcloud_image,
161
+ )
162
 
163
 
164
  gio = GradioInference()
 
183
  with gr.Tab("From YouTube"):
184
  with gr.Box():
185
  with gr.Row().style(equal_height=True):
186
+ size = gr.Dropdown(
187
+ label="Model Size", choices=gio.sizes, value="base"
188
+ )
189
+ lang = gr.Dropdown(
190
+ label="Language (Optional)", choices=gio.langs, value="none"
191
+ )
192
+ link = gr.Textbox(
193
+ label="YouTube Link", placeholder="Enter YouTube link..."
194
+ )
195
  title = gr.Label(label="Video Title")
196
  with gr.Row().style(equal_height=True):
197
  img = gr.Image(label="Thumbnail")
198
+ text = gr.Textbox(
199
+ label="Transcription",
200
+ placeholder="Transcription Output...",
201
+ lines=10,
202
+ ).style(show_copy_button=True, container=True)
203
  with gr.Row().style(equal_height=True):
204
+ summary = gr.Textbox(
205
+ label="Summary", placeholder="Summary Output...", lines=5
206
+ ).style(show_copy_button=True, container=True)
207
+ keywords = gr.Textbox(
208
+ label="Keywords", placeholder="Keywords Output...", lines=5
209
+ ).style(show_copy_button=True, container=True)
210
  label = gr.Label(label="Sentiment Analysis")
211
+ wordcloud_image = gr.Image()
212
  with gr.Row().style(equal_height=True):
213
+ clear = gr.ClearButton(
214
+ [link, title, img, text, summary, keywords, label, wordcloud_image], scale=1
215
+ )
216
+ btn = gr.Button("Get video insights", variant="primary", scale=1)
217
+ btn.click(
218
+ gio,
219
+ inputs=[link, lang, size],
220
+ outputs=[text, summary, keywords, label, wordcloud_image],
221
+ )
222
+ if link:
223
+ link.change(gio.populate_metadata, inputs=[link], outputs=[img, title])
224
 
225
  with gr.Tab("From Audio file"):
226
  with gr.Box():
227
  with gr.Row().style(equal_height=True):
228
+ size = gr.Dropdown(
229
+ label="Model Size", choices=gio.sizes, value="base"
230
+ )
231
+ lang = gr.Dropdown(
232
+ label="Language (Optional)", choices=gio.langs, value="none"
233
+ )
234
  audio_file = gr.Audio(type="filepath")
235
  with gr.Row().style(equal_height=True):
236
+ text = gr.Textbox(
237
+ label="Transcription",
238
+ placeholder="Transcription Output...",
239
+ lines=10,
240
+ ).style(show_copy_button=True, container=False)
241
  with gr.Row().style(equal_height=True):
242
+ summary = gr.Textbox(
243
+ label="Summary", placeholder="Summary Output", lines=5
244
+ )
245
+ keywords = gr.Textbox(
246
+ label="Keywords", placeholder="Keywords Output", lines=5
247
+ )
248
  label = gr.Label(label="Sentiment Analysis")
249
+ wordcloud_image = gr.Image()
250
  with gr.Row().style(equal_height=True):
251
+ clear = gr.ClearButton([audio_file,text, summary, keywords, label, wordcloud_image], scale=1)
252
+ btn = gr.Button(
253
+ "Get video insights", variant="primary", scale=1
254
+ )
255
+ btn.click(
256
+ gio.from_audio_input,
257
+ inputs=[lang, size, audio_file],
258
+ outputs=[text, summary, keywords, label, wordcloud_image],
259
+ )
260
+
261
 
262
  with block:
263
+ gr.Markdown("### Video Examples")
264
+ gr.Examples(["https://www.youtube.com/shorts/xDNzz8yAH7I"], inputs=link)
265
+
266
  gr.Markdown("About the app:")
267
+
268
  with gr.Accordion("What is YouTube Insights?", open=False):
269
+ gr.Markdown(
270
+ "YouTube Insights is a tool developed with academic purposes only, that creates summaries, keywords and sentiments analysis based on YouTube videos or user audio files."
271
+ )
272
+
273
  with gr.Accordion("How does it work?", open=False):
274
+ gr.Markdown(
275
+ "Works by using OpenAI's Whisper, BART for summarization and VoiceLabT5 for Keyword Extraction."
276
+ )
277
 
278
+ gr.HTML(
279
+ """
280
  <div style="text-align: center; max-width: 500px; margin: 0 auto;">
281
  <p style="margin-bottom: 10px; font-size: 96%">
282
  2023 Master in Big Data & Data Science - Universidad Complutense de Madrid
283
  </p>
284
  </div>
285
+ """
286
+ )
287
 
288
+ demo.launch()