Spaces:
Sleeping
Sleeping
File size: 3,558 Bytes
dca16ec bf88188 dca16ec bf88188 dca16ec bf88188 dca16ec bf88188 dca16ec bf88188 dca16ec bf88188 dca16ec bf88188 dca16ec a693c8e dca16ec |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 |
import streamlit as st
import torch
import clip
from PIL import Image
from typing import List, Tuple
import numpy as np
# Load the CLIP model and its matching image preprocessing pipeline once at
# import time, on the GPU when one is available and on the CPU otherwise.
# NOTE(review): Streamlit re-executes the script on each interaction, so this
# load presumably repeats per rerun; consider a cached loader - TODO confirm.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)
# Function to predict descriptions and probabilities
def predict(image: Image.Image, descriptions: List[str]) -> Tuple[str, float]:
    """
    Pick the description that CLIP scores as the best match for an image.

    The image and every description are run through the module-level CLIP
    model in a single forward pass; the image-to-text logits are turned into
    probabilities with a softmax over the supplied descriptions.

    Args:
        image (Image.Image): The input image to evaluate.
        descriptions (List[str]): Candidate textual descriptions to compare
            against the image.

    Returns:
        Tuple[str, float]: The best-matching description and its probability
        (a plain Python float), relative to the other descriptions given.

    Raises:
        ValueError: If ``descriptions`` is empty.
    """
    if not descriptions:
        raise ValueError("At least one description is required.")

    # Preprocess the image, add a batch dimension, and move it to the device
    image_input = preprocess(image).unsqueeze(0).to(device)
    # Tokenize the descriptions and move them to the same device
    text_input = clip.tokenize(descriptions).to(device)

    with torch.no_grad():
        # The forward pass encodes both inputs itself, so separate
        # encode_image / encode_text calls would only repeat that work.
        logits_per_image, _ = model(image_input, text_input)
        # Convert logits to probabilities; flatten so the index of the
        # maximum lines up with the position in `descriptions`.
        probs = logits_per_image.softmax(dim=-1).cpu().numpy().ravel()

    best_index = int(np.argmax(probs))
    # Cast to a built-in float so the return value matches the annotation
    return descriptions[best_index], float(probs[best_index])
# Streamlit app
def main():
    """
    Render the "2 Lies and 1 Truth" page.

    The user uploads an image and writes three descriptions of it; on
    "Predict" the descriptions are passed to ``predict`` and the best match is
    shown with its probability. Nothing past the uploader is rendered until an
    image has been uploaded, and no prediction is made until all three
    descriptions contain text.
    """
    st.title("Image understanding model")

    # Instructions for the user
    st.markdown("---")
    st.markdown("### Upload an image to test how well the model understands it")

    # Upload image through Streamlit with a unique key
    uploaded_image = st.file_uploader(
        "Upload an image...", type=["jpg", "png", "jpeg"], key="uploaded_image"
    )
    if uploaded_image is None:
        return

    # Convert the uploaded file to a PIL image; a file that is not a readable
    # image is reported to the user instead of surfacing a traceback.
    try:
        pil_image = Image.open(uploaded_image)
    except OSError:
        st.error("The uploaded file could not be read as an image.")
        return

    # Show the uploaded image.
    # NOTE(review): per the Streamlit docs use_column_width=True takes
    # precedence over width, so width=200 likely has no effect - confirm.
    st.image(pil_image, caption="Uploaded Image.", use_column_width=True, width=200)

    # Instructions for the user
    st.markdown("### 2 Lies and 1 Truth")
    st.markdown("Write 3 descriptions about the image, 1 must be true.")

    # Get user input for descriptions
    description1 = st.text_input("Description 1:", placeholder='A red apple')
    description2 = st.text_input("Description 2:", placeholder='A car parked in a garage')
    description3 = st.text_input("Description 3:", placeholder='An orange fruit on a tree')
    # Strip whitespace so a description made only of spaces counts as empty
    descriptions = [
        text.strip() for text in (description1, description2, description3)
    ]

    # Button to trigger prediction
    if not st.button("Predict"):
        return

    if not all(descriptions):
        # Tell the user why nothing happened instead of silently ignoring the click
        st.warning("Please fill in all 3 descriptions before predicting.")
        return

    # Make predictions
    best_description, best_prob = predict(pil_image, descriptions)

    # Display the highest probability description and its probability
    st.write(f"**Best Description:** {best_description}")
    st.write(f"**Prediction Probability:** {best_prob:.2%}")

    # Display progress bar for the highest probability
    st.progress(float(best_prob))


# Run the app only when this file is executed as the main script
if __name__ == "__main__":
    main()
|