fawadrashid committed
Commit f42c65b · verified · 1 Parent(s): c545edf

Update app.py

Files changed (1)
  1. app.py +25 -6
app.py CHANGED
@@ -1,6 +1,8 @@
 import os
+from PIL import Image, ImageDraw, ImageFont
 import gradio as gr
 from helper import load_image_from_url, render_results_in_image
+from helper import summarize_predictions_natural_language
 from transformers import pipeline
 from transformers.utils import logging
 logging.set_verbosity_error()
@@ -9,7 +11,12 @@ from helper import ignore_warnings
 ignore_warnings()
 
 
-od_pipe = pipeline("object-detection", "./models/facebook/detr-resnet-50")
+
+
+od_pipe = pipeline("object-detection", "facebook/detr-resnet-50")
+tts_pipe = pipeline("text-to-speech",
+                    model="kakao-enterprise/vits-ljs")
+
 
 def get_pipeline_prediction(pil_image):
 
@@ -17,14 +24,26 @@ def get_pipeline_prediction(pil_image):
 
     processed_image = render_results_in_image(pil_image,
                                               pipeline_output)
-    return processed_image
+
+    text = summarize_predictions_natural_language(pipeline_output)
+    print(text)
+    narrated_text = tts_pipe(text)
+
+    # print(narrated_text)
+    print(narrated_text["audio"][0])
+    print(narrated_text["sampling_rate"])
+    return processed_image, (narrated_text["sampling_rate"], narrated_text["audio"][0])
+    # return processed_image
+
 
 demo = gr.Interface(
     fn=get_pipeline_prediction,
     inputs=gr.Image(label="Input image",
                     type="pil"),
-    outputs=gr.Image(label="Output image with predicted instances",
-                     type="pil")
+    outputs=[gr.Image(label="Output image with predicted instances",
+                      type="pil"), gr.Audio(label="Narration", type="numpy", autoplay=True)]
+    # outputs=gr.Image(label="Output image with predicted instances",
+    #                  type="pil")
 )
 
-demo_blocks.queue().launch(server_name="0.0.0.0", server_port=7860)
+demo.launch(server_name="0.0.0.0", server_port=7860)
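
The newly imported summarize_predictions_natural_language lives in helper.py, which is not part of this diff, so its real implementation is unknown. As a minimal sketch of what it plausibly does with the object-detection pipeline's output (a list of {"score", "label", "box"} dicts), assuming a hypothetical implementation:

    from collections import Counter

    def summarize_predictions_natural_language(pipeline_output):
        """Hypothetical sketch: condense detections into one sentence."""
        # The object-detection pipeline yields dicts such as
        # {"score": 0.99, "label": "cat", "box": {...}}; count each label.
        counts = Counter(pred["label"] for pred in pipeline_output)
        # Naive pluralization, for illustration only.
        parts = [f"{n} {label}" + ("s" if n > 1 else "") for label, n in counts.items()]
        return "In this image, there are " + " and ".join(parts) + "."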
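
The return-value plumbing can be read off the diff: the transformers text-to-speech pipeline returns a dict with a batched NumPy waveform under "audio" (hence the [0]) and an integer "sampling_rate", and gr.Audio(type="numpy") consumes a (sampling_rate, waveform) tuple. A minimal sketch of that round trip, using the same checkpoint as the commit (the sample sentence is illustrative):

    from transformers import pipeline

    # Same checkpoint as in the diff; downloads on first use.
    tts_pipe = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")

    narrated = tts_pipe("In this image, there are two cats.")
    waveform = narrated["audio"][0]    # batch of 1 -> 1-D float waveform
    rate = narrated["sampling_rate"]   # rate reported by the model, e.g. 22050 Hz
    gradio_audio = (rate, waveform)    # the tuple gr.Audio(type="numpy") expects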