Tonic committed on
Commit
4641b71
·
verified ·
1 Parent(s): bc7047f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +34 -34
app.py CHANGED
@@ -1,45 +1,45 @@
1
- import os
2
  import requests
3
  from io import BytesIO
4
-
5
  from PIL import Image
6
  from transformers import AutoProcessor, AutoModelForVision2Seq
 
7
 
8
- def generate_caption(image):
9
- # Load pre-trained models & processors
10
- model = AutoModelForVision2Seq.from_pretrained("microsoft/kosmos-2-patch14-224")
11
- processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224")
12
-
13
- prompt = "<grounding>An image of"
14
 
15
- # Open the uploaded image file
16
- img = Image.open(BytesIO(image))
17
 
18
- # Save the image locally and open it again to avoid potential issues with reusing the same PIL object
19
- img.save("temp_image.jpg")
20
- img = Image.open("temp_image.jpg")
 
 
21
 
22
- inputs = processor(text=prompt, images=img, return_tensors="pt")
23
-
24
- # Generate caption
25
- generated_ids = model.generate(**inputs, max_new_tokens=128)
26
- generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
27
-
28
- # Process the generated caption
29
- processed_text, _ = processor.post_process_generation(generated_text)
30
-
31
- return processed_text
32
 
33
- import gradio as gr
 
 
34
 
35
- title = 'Image Caption Generator'
36
- description = 'Generate descriptive captions for images.'
37
- examples = [["PRO-b0fe1914d67344d98e120a19cd1aadf1.jpg"]]
38
- article = '<p style="margin:auto;max-width:600px;">This tool generates descriptive captions for given images.</p>'
39
 
40
- interface = gr.Interface(fn=generate_caption,
41
- inputs=gr.Image(),
42
- outputs=gr.Textbox(),
43
- title=title, description=description, examples=examples, article=article)
44
-
45
- interface.launch()
 
 
 
 
 
 
 
 
 
1
+ import torch
2
  import requests
3
  from io import BytesIO
 
4
  from PIL import Image
5
  from transformers import AutoProcessor, AutoModelForVision2Seq
6
+ import gradio as gr
7
 
8
def load_models():
    """Load and cache the Kosmos-2 model and processor.

    The model is moved to the GPU when one is available. The pair is
    memoized on the function object so repeated calls (this function is
    invoked once per caption request by ``generate_description``) do not
    re-download or re-instantiate the multi-gigabyte checkpoint.

    Returns:
        tuple: ``(model, processor)`` for ``microsoft/kosmos-2-patch14-224``.
    """
    if not hasattr(load_models, "_cache"):
        # Prefer GPU inference when available; CPU still works, just slower.
        device = "cuda" if torch.cuda.is_available() else "cpu"
        model = AutoModelForVision2Seq.from_pretrained("microsoft/kosmos-2-patch14-224").to(device)
        processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224")
        load_models._cache = (model, processor)
    return load_models._cache
14
 
 
 
15
 
16
def generate_description(image):
    """Produce a grounded caption for *image* using Kosmos-2.

    Args:
        image: the picture to caption, as delivered by the Gradio image
            input (presumably a PIL image or array the processor accepts
            — TODO confirm against the Gradio component's output type).

    Returns:
        str: the decoded caption, stripped of surrounding whitespace.
    """
    model, processor = load_models()

    # Kosmos-2 grounding prompt prefix.
    prompt = "<grounding>An image of"

    encoded = processor(
        text=prompt,
        images=image,
        padding='max_length',
        truncation=True,
        return_tensors="pt",
    )

    # Keep every input tensor on the same device as the model
    # (GPU when available, CPU otherwise).
    encoded = {name: tensor.to(model.device) for name, tensor in encoded.items()}

    output_ids = model.generate(**encoded, max_new_tokens=128)
    caption = processor.batch_decode(output_ids, skip_special_tokens=True)[0]
    return caption.strip()
30
+
 
 
31
 
32
if __name__ == '__main__':
    # NOTE(review): the original call also passed capture_session=True and
    # allow_recording=False; neither is a valid gr.Interface argument in
    # current Gradio releases (capture_session was deprecated and removed,
    # allow_recording never existed on Interface), and passing them raises
    # TypeError at startup — so both are dropped here.
    interface = gr.Interface(
        generate_description,           # fn: image -> caption string
        ["image"],                      # single image input component
        "text",                         # single text output component
        title="GPT-based Visual Storytelling",
        description="Upload an image to get a detailed caption generated by our powerful AI!",
        examples=[
            ['PRO-b0fe1914d67344d98e120a19cd1aadf1.jpg']
        ],
    )
    interface.launch()