storresbusquets committed on
Commit
79513d7
·
1 Parent(s): 9db221b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +35 -14
app.py CHANGED
@@ -30,6 +30,7 @@ class GradioInference():
30
  if size != self.current_size:
31
  self.loaded_model = whisper.load_model(size)
32
  self.current_size = size
 
33
  results = self.loaded_model.transcribe(path, language=lang)
34
 
35
  # Perform summarization on the transcription
@@ -52,10 +53,30 @@ class GradioInference():
52
  return self.yt.thumbnail_url, self.yt.title
53
 
54
 
55
- def transcribe_audio(audio_file):
56
- model = whisper.load_model("base")
57
- result = model.transcribe(audio_file)
58
- return result["text"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
 
60
 
61
  gio = GradioInference()
@@ -82,14 +103,14 @@ with block as demo:
82
  with gr.Row().style(equal_height=True):
83
  size = gr.Dropdown(label="Model Size", choices=gio.sizes, value='base')
84
  lang = gr.Dropdown(label="Language (Optional)", choices=gio.langs, value="none")
85
- link = gr.Textbox(label="YouTube Link")
86
  title = gr.Label(label="Video Title")
87
  with gr.Row().style(equal_height=True):
88
  img = gr.Image(label="Thumbnail")
89
- text = gr.Textbox(label="Transcription", placeholder="Transcription Output", lines=10).style(show_copy_button=True, container=True)
90
  with gr.Row().style(equal_height=True):
91
- summary = gr.Textbox(label="Summary", placeholder="Summary Output", lines=5).style(show_copy_button=True, container=True)
92
- keywords = gr.Textbox(label="Keywords", placeholder="Keywords Output", lines=5).style(show_copy_button=True, container=True)
93
  label = gr.Label(label="Sentiment Analysis")
94
  with gr.Row().style(equal_height=True):
95
  clear = gr.ClearButton([link, title, img, text, summary, keywords, label], scale=1)
@@ -104,16 +125,16 @@ with block as demo:
104
  lang = gr.Dropdown(label="Language (Optional)", choices=gio.langs, value="none")
105
  audio_file = gr.Audio(type="filepath")
106
  with gr.Row().style(equal_height=True):
107
- # img = gr.Image(label="Thumbnail")
108
- text = gr.Textbox(label="Transcription", placeholder="Transcription Output", lines=10).style(show_copy_button=True, container=False)
109
  # with gr.Row().style(equal_height=True):
110
- # summary = gr.Textbox(label="Summary", placeholder="Summary Output", lines=5)
111
- # keywords = gr.Textbox(label="Keywords", placeholder="Keywords Output", lines=5)
 
112
  with gr.Row().style(equal_height=True):
113
  clear = gr.ClearButton([text], scale=1)
114
  btn = gr.Button("Get video insights", variant='primary', scale=1) # Updated button label
115
- btn.click(transcribe_audio, inputs=[audio_file], outputs=[text])
116
- # link.change(gio.populate_metadata, inputs=[link], outputs=[img, title])
117
 
118
  with block:
119
  gr.Markdown("About the app:")
 
30
  if size != self.current_size:
31
  self.loaded_model = whisper.load_model(size)
32
  self.current_size = size
33
+
34
  results = self.loaded_model.transcribe(path, language=lang)
35
 
36
  # Perform summarization on the transcription
 
53
  return self.yt.thumbnail_url, self.yt.title
54
 
55
 
56
+ def from_audio_input(self, lang, size, audio_file):
57
+ if lang == "none":
58
+ lang = None
59
+
60
+ if size != self.current_size:
61
+ self.loaded_model = whisper.load_model(size)
62
+ self.current_size = size
63
+
64
+ results = self.loaded_model.transcribe(audio_file, language=lang)
65
+
66
+ # Perform summarization on the transcription
67
+ transcription_summary = self.summarizer(results["text"], max_length=130, min_length=30, do_sample=False)
68
+
69
+ # Extract keywords using VoiceLabT5
70
+ task_prefix = "Keywords: "
71
+ input_sequence = task_prefix + results["text"]
72
+ input_ids = self.keyword_tokenizer(input_sequence, return_tensors="pt", truncation=False).input_ids
73
+ output = self.keyword_model.generate(input_ids, no_repeat_ngram_size=3, num_beams=4)
74
+ predicted = self.keyword_tokenizer.decode(output[0], skip_special_tokens=True)
75
+ keywords = [x.strip() for x in predicted.split(',') if x.strip()]
76
+
77
+ label = self.classifier(results["text"])[0]["label"]
78
+
79
+ return results["text"], transcription_summary[0]["summary_text"], keywords, label
80
 
81
 
82
  gio = GradioInference()
 
103
  with gr.Row().style(equal_height=True):
104
  size = gr.Dropdown(label="Model Size", choices=gio.sizes, value='base')
105
  lang = gr.Dropdown(label="Language (Optional)", choices=gio.langs, value="none")
106
+ link = gr.Textbox(label="YouTube Link", placeholder="Enter YouTube link...")
107
  title = gr.Label(label="Video Title")
108
  with gr.Row().style(equal_height=True):
109
  img = gr.Image(label="Thumbnail")
110
+ text = gr.Textbox(label="Transcription", placeholder="Transcription Output...", lines=10).style(show_copy_button=True, container=True)
111
  with gr.Row().style(equal_height=True):
112
+ summary = gr.Textbox(label="Summary", placeholder="Summary Output...", lines=5).style(show_copy_button=True, container=True)
113
+ keywords = gr.Textbox(label="Keywords", placeholder="Keywords Output...", lines=5).style(show_copy_button=True, container=True)
114
  label = gr.Label(label="Sentiment Analysis")
115
  with gr.Row().style(equal_height=True):
116
  clear = gr.ClearButton([link, title, img, text, summary, keywords, label], scale=1)
 
125
  lang = gr.Dropdown(label="Language (Optional)", choices=gio.langs, value="none")
126
  audio_file = gr.Audio(type="filepath")
127
  with gr.Row().style(equal_height=True):
128
+ text = gr.Textbox(label="Transcription", placeholder="Transcription Output...", lines=10).style(show_copy_button=True, container=False)
 
129
  # with gr.Row().style(equal_height=True):
130
+ summary = gr.Textbox(label="Summary", placeholder="Summary Output", lines=5)
131
+ keywords = gr.Textbox(label="Keywords", placeholder="Keywords Output", lines=5)
132
+ label = gr.Label(label="Sentiment Analysis")
133
  with gr.Row().style(equal_height=True):
134
  clear = gr.ClearButton([text], scale=1)
135
  btn = gr.Button("Get video insights", variant='primary', scale=1) # Updated button label
136
+ btn.click(gio.from_audio_input, inputs=[lang, size, audio_file], outputs=[text, summary, keywords, label])
137
+
138
 
139
  with block:
140
  gr.Markdown("About the app:")