|
from PIL import Image |
|
import requests |
|
|
|
from transformers import CLIPProcessor, CLIPModel |
|
|
|
def food_not_food(input_image): |
|
model = CLIPModel.from_pretrained("flax-community/clip-rsicd-v2") |
|
processor = CLIPProcessor.from_pretrained("flax-community/clip-rsicd-v2") |
|
|
|
labels = ["food", "not food"] |
|
inputs = processor(text=[f"a photo of a {l}" for l in labels], images=input_image, return_tensors="pt", padding=True) |
|
|
|
outputs = model(**inputs) |
|
logits_per_image = outputs.logits_per_image |
|
prob = logits_per_image.softmax(dim=1).detach().cpu().numpy().argmax(axis=1) |
|
return labels[prob[0]] |