Spaces:

ashish-soni08
/

Generate-with-OpenAI-CLIP

Sleeping

File size: 3,558 Bytes

dca16ec
 
 
 
bf88188
dca16ec
 
 
 
 
 
 
bf88188
 
 
 
 
 
 
 
 
 
 
 
 
dca16ec
bf88188
dca16ec
 
 
bf88188
dca16ec
 
 
bf88188
dca16ec
bf88188
dca16ec
 
bf88188
dca16ec
 
 
 
a693c8e
dca16ec

import streamlit as st
import torch
import clip
from PIL import Image
from typing import List, Tuple
import numpy as np

# Load CLIP model and preprocessing.
# Streamlit re-executes this script on every widget interaction, so the load is
# routed through a resource cache (when the installed Streamlit provides one)
# instead of re-reading the model on each rerun.
device = "cuda" if torch.cuda.is_available() else "cpu"


def _load_clip_model(device_name: str):
    """Load the ViT-B/32 CLIP model and its preprocessing callable on *device_name*."""
    return clip.load("ViT-B/32", device=device_name)


# `st.cache_resource` is missing on older Streamlit releases; in that case the
# loader stays uncached, which matches the previous behavior.
_cache_resource = getattr(st, "cache_resource", None)
if _cache_resource is not None:
    _load_clip_model = _cache_resource(_load_clip_model)

model, preprocess = _load_clip_model(device)

# Function to predict descriptions and probabilities
def predict(image: Image.Image, descriptions: List[str]) -> Tuple[str, float]:
    """
    Pick the description that best matches the image according to CLIP.

    The image is preprocessed, the descriptions are tokenized, and both are
    passed through the module-level CLIP ``model``. The image-to-text logits
    are turned into probabilities with a softmax over the descriptions.

    Args:
        image (Image.Image): The input image to evaluate.
        descriptions (List[str]): Candidate textual descriptions to compare against the image.

    Returns:
        Tuple[str, float]: The best-matching description and its probability
        as a plain Python float.

    Raises:
        ValueError: If ``descriptions`` is empty.
    """
    if not descriptions:
        raise ValueError("descriptions must contain at least one entry")

    # Preprocess the image into a batch of one and move it to the target device
    image_input = preprocess(image).unsqueeze(0).to(device)
    # Tokenize the descriptions and move them to the same device
    text_tokens = clip.tokenize(descriptions).to(device)

    with torch.no_grad():
        # A single forward pass yields the image/text similarity logits; the
        # separate encode_image/encode_text calls were redundant work whose
        # results were never used.
        logits_per_image, _ = model(image_input, text_tokens)
        # Only one image was passed in, so take the first (only) row of probabilities
        probs = logits_per_image.softmax(dim=-1).cpu().numpy()[0]

    best_index = int(np.argmax(probs))
    # Convert the NumPy scalar to a built-in float to honor the annotated return type
    return descriptions[best_index], float(probs[best_index])

# Streamlit app
def main():
    """
    Render the Streamlit page for the "2 Lies and 1 Truth" image game.

    The user uploads an image, enters three descriptions, and presses
    "Predict"; the description that ``predict`` ranks highest is shown
    together with its probability and a progress bar.
    """
    st.title("Image understanding model")

    # Instructions for the user
    st.markdown("---")
    st.markdown("### Upload an image to test how well the model understands it")

    # Upload image through Streamlit with a unique key
    uploaded_image = st.file_uploader("Upload an image...", type=["jpg", "png", "jpeg"], key="uploaded_image")

    # Nothing else to render until a file has been provided
    if uploaded_image is None:
        return

    # Convert the uploaded file to a PIL Image; a file that PIL cannot open
    # raises an OSError subclass, which is reported instead of crashing the app
    try:
        pil_image = Image.open(uploaded_image)
    except OSError:
        st.error("The uploaded file could not be read as an image.")
        return

    # Show the uploaded image. Both use_column_width=True and width=200 are
    # passed as before; no argument here limits the image height.
    st.image(pil_image, caption="Uploaded Image.", use_column_width=True, width=200)

    # Instructions for the user
    st.markdown("### 2 Lies and 1 Truth")
    st.markdown("Write 3 descriptions about the image, 1 must be true.")

    # Get user input for descriptions
    description1 = st.text_input("Description 1:", placeholder='A red apple')
    description2 = st.text_input("Description 2:", placeholder='A car parked in a garage')
    description3 = st.text_input("Description 3:", placeholder='An orange fruit on a tree')

    # Strip surrounding whitespace so a blank-looking entry counts as missing
    descriptions = [text.strip() for text in (description1, description2, description3)]

    # Button to trigger prediction
    if not st.button("Predict"):
        return

    # Tell the user why nothing happens instead of silently ignoring the click
    if not all(descriptions):
        st.warning("Please fill in all 3 descriptions before predicting.")
        return

    # Make predictions
    best_description, best_prob = predict(pil_image, descriptions)

    # Display the highest probability description and its probability
    st.write(f"**Best Description:** {best_description}")
    st.write(f"**Prediction Probability:** {best_prob:.2%}")

    # Display progress bar for the highest probability
    st.progress(float(best_prob))

# Run the Streamlit app only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()