# Spaces: Runtime error
"""
Using as reference:
- https://huggingface.co/nvidia/segformer-b0-finetuned-ade-512-512
- https://huggingface.co/spaces/chansung/segformer-tf-transformers/blob/main/app.py
- https://huggingface.co/facebook/detr-resnet-50-panoptic
- https://tech.amikelive.com/node-718/what-object-categories-labels-are-in-coco-dataset/
- https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/DETR/DETR_panoptic_segmentation_minimal_example_(with_DetrFeatureExtractor).ipynb
- https://arxiv.org/abs/2005.12872

Additions:
- add shown labels as strings
- show only animal masks (ask an nlp model?)
"""
import gradio as gr
import numpy as np
import torch
import torchvision
from PIL import Image
from transformers import DetrFeatureExtractor, DetrForSegmentation
# Returns a list with a color per ADE class (150 classes)
# from https://huggingface.co/spaces/chansung/segformer-tf-transformers/blob/main/app.py
def ade_palette():
    """Return the ADE20K palette that maps each class to RGB values.

    Returns:
        A freshly built list of 150 ``[R, G, B]`` lists (one per class index),
        each channel an int in the 0-255 range. A new list is created on every
        call, so callers may mutate the result freely.
    """
    return [
        [120, 120, 120],
        [180, 120, 120],
        [6, 230, 230],
        [80, 50, 50],
        [4, 200, 3],
        [120, 120, 80],
        [140, 140, 140],
        [204, 5, 255],
        [230, 230, 230],
        [4, 250, 7],
        [224, 5, 255],
        [235, 255, 7],
        [150, 5, 61],
        [120, 120, 70],
        [8, 255, 51],
        [255, 6, 82],
        [143, 255, 140],
        [204, 255, 4],
        [255, 51, 7],
        [204, 70, 3],
        [0, 102, 200],
        [61, 230, 250],
        [255, 6, 51],
        [11, 102, 255],
        [255, 7, 71],
        [255, 9, 224],
        [9, 7, 230],
        [220, 220, 220],
        [255, 9, 92],
        [112, 9, 255],
        [8, 255, 214],
        [7, 255, 224],
        [255, 184, 6],
        [10, 255, 71],
        [255, 41, 10],
        [7, 255, 255],
        [224, 255, 8],
        [102, 8, 255],
        [255, 61, 6],
        [255, 194, 7],
        [255, 122, 8],
        [0, 255, 20],
        [255, 8, 41],
        [255, 5, 153],
        [6, 51, 255],
        [235, 12, 255],
        [160, 150, 20],
        [0, 163, 255],
        [140, 140, 140],
        [250, 10, 15],
        [20, 255, 0],
        [31, 255, 0],
        [255, 31, 0],
        [255, 224, 0],
        [153, 255, 0],
        [0, 0, 255],
        [255, 71, 0],
        [0, 235, 255],
        [0, 173, 255],
        [31, 0, 255],
        [11, 200, 200],
        [255, 82, 0],
        [0, 255, 245],
        [0, 61, 255],
        [0, 255, 112],
        [0, 255, 133],
        [255, 0, 0],
        [255, 163, 0],
        [255, 102, 0],
        [194, 255, 0],
        [0, 143, 255],
        [51, 255, 0],
        [0, 82, 255],
        [0, 255, 41],
        [0, 255, 173],
        [10, 0, 255],
        [173, 255, 0],
        [0, 255, 153],
        [255, 92, 0],
        [255, 0, 255],
        [255, 0, 245],
        [255, 0, 102],
        [255, 173, 0],
        [255, 0, 20],
        [255, 184, 184],
        [0, 31, 255],
        [0, 255, 61],
        [0, 71, 255],
        [255, 0, 204],
        [0, 255, 194],
        [0, 255, 82],
        [0, 10, 255],
        [0, 112, 255],
        [51, 0, 255],
        [0, 194, 255],
        [0, 122, 255],
        [0, 255, 163],
        [255, 153, 0],
        [0, 255, 10],
        [255, 112, 0],
        [143, 255, 0],
        [82, 0, 255],
        [163, 255, 0],
        [255, 235, 0],
        [8, 184, 170],
        [133, 0, 255],
        [0, 255, 92],
        [184, 0, 255],
        [255, 0, 31],
        [0, 184, 255],
        [0, 214, 255],
        [255, 0, 112],
        [92, 255, 0],
        [0, 224, 255],
        [112, 224, 255],
        [70, 184, 160],
        [163, 0, 255],
        [153, 0, 255],
        [71, 255, 0],
        [255, 0, 163],
        [255, 204, 0],
        [255, 0, 143],
        [0, 255, 235],
        [133, 255, 0],
        [255, 0, 235],
        [245, 0, 255],
        [255, 0, 122],
        [255, 245, 0],
        [10, 190, 212],
        [214, 255, 0],
        [0, 204, 255],
        [20, 0, 255],
        [255, 255, 0],
        [0, 153, 255],
        [0, 41, 255],
        [0, 255, 204],
        [41, 0, 255],
        [41, 255, 0],
        [173, 0, 255],
        [0, 245, 255],
        [71, 0, 255],
        [122, 0, 255],
        [0, 255, 184],
        [0, 92, 255],
        [184, 255, 0],
        [0, 133, 255],
        [255, 214, 0],
        [25, 194, 194],
        [102, 255, 0],
        [92, 0, 255],
    ]
def predict_animal_mask(im,
                        gr_slider_confidence):
    """Overlay the confident DETR panoptic masks on a downsized copy of *im*.

    Despite the name, no animal-specific filtering happens here: every query
    whose confidence passes the threshold contributes a mask.

    Args:
        im: Input image as a numpy array accepted by ``PIL.Image.fromarray``
            (presumably H x W x 3 uint8 as delivered by the Gradio image
            input -- confirm against the caller).
        gr_slider_confidence: Confidence threshold in percent (0-100). A query
            is kept only when its best class probability, ignoring the last
            ("no-object") class, is strictly greater than this value / 100.

    Returns:
        A ``uint8`` numpy array of the image resized to 200 x 200, blended
        50/50 with a color mask. Each pixel is colored by the index (among
        the kept queries, not the class id) of the kept query with the highest
        mask score at that pixel. When no query passes the threshold the
        resized image is returned unblended.
    """
    # Convert up front so both the model and the blend see 3-channel RGB,
    # whatever mode the incoming array maps to.
    image = Image.fromarray(im).convert('RGB').resize((200, 200))
    rgb = np.array(image)

    # encoding is a dict with pixel_values and pixel_mask
    encoding = feature_extractor(images=image, return_tensors="pt")
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        outputs = model(**encoding)

    # Best class probability per query, excluding the trailing "no-object" class.
    prob_per_query = outputs.logits.softmax(-1)[..., :-1].max(-1)[0]
    keep = prob_per_query > gr_slider_confidence / 100.0

    # Boolean indexing already drops the batch axis and yields (K, h, w).
    # Do not squeeze(): with K == 1 it would remove the query axis and the
    # argmax below would run over image rows instead of over queries.
    kept_masks = outputs.pred_masks[keep]
    if kept_masks.shape[0] == 0:
        # Nothing is confident enough; argmax over an empty axis would raise.
        return rgb.astype(np.uint8)

    # For every pixel, the index of the kept query with the highest mask score.
    label_per_pixel = torch.argmax(kept_masks, dim=0).cpu().numpy()

    # Vectorized palette lookup; wrap around rather than fail should there
    # ever be more kept queries than palette entries.
    palette = np.asarray(ade_palette(), dtype=np.float64)
    color_mask = palette[label_per_pixel % len(palette)]

    # The mask resolution is set by the model, not by the image; align them
    # if they differ (nearest-neighbour keeps the colors unmixed).
    if color_mask.shape[:2] != rgb.shape[:2]:
        resized = Image.fromarray(color_mask.astype(np.uint8)).resize(
            image.size, resample=Image.NEAREST
        )
        color_mask = np.array(resized, dtype=np.float64)

    # Show image + mask
    pred_img = rgb * 0.5 + color_mask * 0.5
    return pred_img.astype(np.uint8)
#######################################
# Get the pretrained preprocessor and panoptic segmentation model from Hugging Face
feature_extractor = DetrFeatureExtractor.from_pretrained('facebook/detr-resnet-50-panoptic')
model = DetrForSegmentation.from_pretrained('facebook/detr-resnet-50-panoptic')

# NOTE(review): gr.inputs / gr.outputs are Gradio's legacy component
# namespaces; newer Gradio releases expose gr.Image / gr.Slider instead.
# Confirm against the Gradio version pinned for this Space before migrating.

# gradio components - inputs
gr_image_input = gr.inputs.Image()
# Threshold is expressed in percent; predict_animal_mask divides it by 100.
gr_slider_confidence = gr.inputs.Slider(
    minimum=0,
    maximum=100,
    step=5,
    default=85,
    label='Set confidence threshold for masks',
)

# gradio outputs
gr_image_output = gr.outputs.Image()

####################################################
# Create user interface and launch
gr.Interface(
    predict_animal_mask,
    inputs=[gr_image_input, gr_slider_confidence],
    outputs=gr_image_output,
    title='Image segmentation with varying confidence',
    description="An image segmentation webapp using DETR (End-to-End Object Detection) model with ResNet-50 backbone",
).launch()
####################################
# url = "http://images.cocodataset.org/val2017/000000039769.jpg"
# image = Image.open(requests.get(url, stream=True).raw)
# inputs = feature_extractor(images=image, return_tensors="pt")
# outputs = model(**inputs)
# logits = outputs.logits # shape (batch_size, num_labels, height/4, width/4)