# app.py — Gradio CLIP classification demo
# (source metadata: uploaded by scr930, commit "Update app.py", id 1848536, verified)
import gradio as gr
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch
# Download (or load from cache) the StreetCLIP checkpoint: the processor
# handles image/text preprocessing, the model produces the embeddings.
processor = CLIPProcessor.from_pretrained("geolocal/StreetCLIP")
model = CLIPModel.from_pretrained("geolocal/StreetCLIP")
def classify_image(image, labels=None):
    """Score *image* against a set of candidate text labels with CLIP.

    Args:
        image: PIL.Image (or anything the CLIP processor accepts) to classify.
        labels: Optional list of candidate caption strings. Defaults to the
            original demo set (cat / dog / car / tree) for backward
            compatibility.

    Returns:
        dict mapping each label string to its softmax probability.
    """
    if labels is None:
        # Original hard-coded demo labels, kept as the default.
        labels = [
            "a photo of a cat",
            "a photo of a dog",
            "a photo of a car",
            "a photo of a tree",
        ]
    # Tokenize the labels and preprocess the image into model-ready tensors.
    inputs = processor(text=labels, images=image, return_tensors="pt", padding=True)
    # Inference only: disable autograd so no gradient graph is built per request.
    with torch.no_grad():
        outputs = model(**inputs)
    # logits_per_image: image-text similarity scores, shape (1, len(labels)).
    logits_per_image = outputs.logits_per_image
    probs = logits_per_image.softmax(dim=1)  # normalize scores to probabilities
    probs_list = probs.tolist()[0]
    return dict(zip(labels, probs_list))
# Wire the classifier into a Gradio web UI: a PIL image input feeding
# classify_image, rendered with the built-in label (probability bar) output.
iface = gr.Interface(
    fn=classify_image,
    inputs=gr.Image(type="pil"),
    outputs="label",
    title="Geolocal StreetCLIP Classification",
    description="Upload an image to classify using Geolocal StreetCLIP",
)

# Start the local web server for the demo.
iface.launch()