fashxp committed on
Commit
83bfe0f
1 Parent(s): 237340f

initial commit

Files changed (3)
  1. .gitignore +3 -0
  2. README.md +38 -0
  3. handler.py +46 -0
.gitignore ADDED
@@ -0,0 +1,3 @@
+ # PhpStorm / IDEA
+ .idea
+
README.md ADDED
@@ -0,0 +1,38 @@
+ ---
+ tags:
+ - vision
+ - image-to-text
+ - endpoints-template
+ inference: false
+ pipeline_tag: image-to-text
+ base_model: Salesforce/blip-image-captioning-large
+ library_name: generic
+ ---
+
+ # Fork of [Salesforce/blip-image-captioning-large](https://huggingface.co/Salesforce/blip-image-captioning-large) for an `image-to-text` Inference Endpoint
+
+ > Inspired by https://huggingface.co/sergeipetrov/blip_captioning
+
+ This repository implements a `custom` task for `image-to-text` for 🤗 Inference Endpoints to allow image captioning.
+ The code for the customized pipeline is in handler.py.
+
+ To deploy this model to an Inference Endpoint, select `Custom` as the task so that the `handler.py` file is used.
+
+ ### Expected request payload
+
+ The image to be captioned, sent as binary data.
+
+ #### cURL
+
+ ```
+ curl URL \
+ -X POST \
+ --data-binary @car.png \
+ -H "Content-Type: image/png"
+ ```
+
+ #### Python
+
+ ```python
+ requests.post(ENDPOINT_URL, headers={"Content-Type": "image/png"}, data=open("car.png", 'rb').read()).json()
+ ```
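For reference, a slightly fuller version of the Python call from the README above: a minimal sketch (not part of the committed files) that assumes a hypothetical `ENDPOINT_URL` and, for protected endpoints, a Hugging Face access token in `HF_TOKEN`. It parses the `[{"generated_text": [...]}]` response shape produced by `handler.py`.

```python
import os
import requests

# Hypothetical values: replace with your own endpoint URL and access token.
ENDPOINT_URL = "https://<your-endpoint>.endpoints.huggingface.cloud"
HF_TOKEN = os.environ.get("HF_TOKEN", "")


def caption_image(path: str) -> str:
    """Send a local PNG to the endpoint and return the generated caption."""
    with open(path, "rb") as f:
        response = requests.post(
            ENDPOINT_URL,
            headers={
                "Content-Type": "image/png",
                # Only required if the endpoint is not public.
                "Authorization": f"Bearer {HF_TOKEN}",
            },
            data=f.read(),
        )
    response.raise_for_status()
    # handler.py returns a list like [{"generated_text": ["<caption>"]}]
    return response.json()[0]["generated_text"][0]


print(caption_image("car.png"))
```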
handler.py ADDED
@@ -0,0 +1,46 @@
+ # +
+ from typing import Dict, List, Any
+ from PIL import Image
+ import torch
+ import os
+ from io import BytesIO
+ from transformers import BlipForConditionalGeneration, BlipProcessor
+ # -
+
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+ class EndpointHandler():
+     def __init__(self, path=""):
+         # load the BLIP model and processor
+
+         self.processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
+         self.model = BlipForConditionalGeneration.from_pretrained(
+             "Salesforce/blip-image-captioning-large"
+         ).to(device)
+         self.model.eval()
+         self.model = self.model.to(device)
+
+
+     def __call__(self, data: Any) -> List[Dict[str, Any]]:
+         """
+         Args:
+             data (:obj:):
+                 binary image data to be captioned
+         Return:
+             A :obj:`list` containing one item with the generated caption, e.g. [{"generated_text": ["A hugging face at the office"]}]:
+             - "generated_text": the generated caption(s) decoded from the model output.
+         """
+         inputs = data.pop("inputs", data)
+         parameters = data.pop("parameters", {})
+
+         processed_image = self.processor(images=inputs, return_tensors="pt")
+         processed_image["pixel_values"] = processed_image["pixel_values"].to(device)
+         processed_image = {**processed_image, **parameters}  # merge generation kwargs, e.g. max_new_tokens
+
+         with torch.no_grad():
+             out = self.model.generate(
+                 **processed_image
+             )
+         captions = self.processor.batch_decode(out, skip_special_tokens=True)
+         # postprocess the prediction into the expected response shape
+         return [{"generated_text": captions}]
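Since `__call__` simply pops `inputs` and `parameters` from the payload dict, the handler can be smoke-tested locally before deployment. A minimal sketch, assuming the repository is cloned locally and a test image `car.png` is available; `max_new_tokens` is just an example `generate()` argument.

```python
from PIL import Image

from handler import EndpointHandler

# Instantiate the handler once; this downloads the BLIP weights on first run.
handler = EndpointHandler(path=".")

# __call__ pops "inputs" and "parameters" from the payload dict, so a local
# test can pass a PIL image plus optional kwargs for model.generate().
payload = {
    "inputs": Image.open("car.png"),        # hypothetical local test image
    "parameters": {"max_new_tokens": 30},   # forwarded to model.generate()
}

print(handler(payload))  # e.g. [{"generated_text": ["a red car parked on a street"]}]
```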