import gradio as gr
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch

# Load the model and processor
model = CLIPModel.from_pretrained("geolocal/StreetCLIP")
processor = CLIPProcessor.from_pretrained("geolocal/StreetCLIP")


def classify_image(image):
    # Example labels for classification
    labels = ["a photo of a cat", "a photo of a dog", "a photo of a car", "a photo of a tree"]

    # Preprocess the image and text
    inputs = processor(text=labels, images=image, return_tensors="pt", padding=True)

    # Perform the inference (no gradients needed at inference time)
    with torch.no_grad():
        outputs = model(**inputs)

    # Postprocess the outputs
    logits_per_image = outputs.logits_per_image  # image-text similarity scores
    probs = logits_per_image.softmax(dim=1)  # softmax over labels gives probabilities

    # Convert the probabilities to a flat list
    probs_list = probs.tolist()[0]

    # Map each label to its probability
    return {label: prob for label, prob in zip(labels, probs_list)}


# Define the Gradio interface
iface = gr.Interface(
    fn=classify_image,
    inputs=gr.Image(type="pil"),
    outputs="label",
    title="Geolocal StreetCLIP Classification",
    description="Upload an image to classify using Geolocal StreetCLIP",
)

# Launch the interface
iface.launch()
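# Note: the labels above are generic placeholders. StreetCLIP is a CLIP model
# fine-tuned for street-level image geolocation, so a more fitting zero-shot
# query (a sketch, not part of the original demo) uses place names as the
# candidate labels inside classify_image, e.g.:
#
#     labels = ["San Francisco", "Paris", "Tokyo", "Sydney"]
#
# Substituting a list like this turns the demo into a coarse
# "which city is this street scene from?" classifier.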