File size: 3,558 Bytes
dca16ec
 
 
 
bf88188
dca16ec
 
 
 
 
 
 
bf88188
 
 
 
 
 
 
 
 
 
 
 
 
dca16ec
bf88188
dca16ec
 
 
bf88188
dca16ec
 
 
bf88188
dca16ec
bf88188
dca16ec
 
bf88188
dca16ec
 
 
 
a693c8e
dca16ec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import streamlit as st
import torch
import clip
from PIL import Image
from typing import List, Tuple
import numpy as np

# Choose the compute device (GPU when CUDA is available, otherwise CPU) and
# load the CLIP "ViT-B/32" model together with its image preprocessing callable.
# NOTE(review): this runs at module level, so it presumably re-executes on every
# Streamlit rerun of the script -- consider caching the load; confirm against the
# Streamlit version in use.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

# Function to predict descriptions and probabilities
def predict(image: Image.Image, descriptions: List[str]) -> Tuple[str, float]:
    """
    Pick the description that the CLIP model scores as the best match for an image.

    The image and all descriptions are run through the module-level CLIP model
    once; the image-to-text logits are turned into probabilities with a softmax
    over the supplied descriptions, so the returned probability is relative to
    the other candidates only.

    Args:
        image (Image.Image): The input image for which the descriptions are being evaluated.
        descriptions (List[str]): A non-empty list of textual descriptions to compare against the image.

    Returns:
        Tuple[str, float]: The best-matching description and its probability as a plain Python float.

    Raises:
        ValueError: If ``descriptions`` is empty.
    """
    # Fail early with a clear message instead of an obscure error from the
    # model or from argmax on an empty array.
    if not descriptions:
        raise ValueError("descriptions must contain at least one entry")

    # Preprocess the image, add a batch dimension, and move it to the device
    image_input = preprocess(image).unsqueeze(0).to(device)
    # Tokenize the descriptions and move them to the same device
    text_tokens = clip.tokenize(descriptions).to(device)

    with torch.no_grad():
        # One forward pass yields the image/text similarity logits; separate
        # encode_image/encode_text calls would only repeat that work because
        # their results were never used.
        logits_per_image, _ = model(image_input, text_tokens)
        # Softmax over the descriptions; [0] selects the single image in the batch
        probs = logits_per_image.softmax(dim=-1).cpu().numpy()[0]

    # Convert NumPy scalars to built-in types so the result matches the annotation
    best_index = int(np.argmax(probs))
    return descriptions[best_index], float(probs[best_index])

# Streamlit app
def main():
    """
    Render the Streamlit page for the "2 Lies and 1 Truth" image game.

    The user uploads an image and enters three descriptions. When "Predict" is
    pressed and all three descriptions are non-blank, the descriptions are
    passed to ``predict`` and the best match is shown with its probability.
    If a description is missing, a warning is shown instead of silently doing
    nothing.
    """
    st.title("Image understanding model")

    # Instructions for the user
    st.markdown("---")
    st.markdown("### Upload an image to test how well the model understands it")

    # Upload image through Streamlit with a unique key
    uploaded_image = st.file_uploader("Upload an image...", type=["jpg", "png", "jpeg"], key="uploaded_image")

    # Nothing more to render until an image has been provided
    if uploaded_image is None:
        return

    # Convert the uploaded file to a PIL Image; report unreadable files to the
    # user instead of letting the exception surface as a traceback.
    try:
        pil_image = Image.open(uploaded_image)
    except OSError:
        st.error("The uploaded file could not be read as an image.")
        return

    # Show the uploaded image.
    # NOTE(review): per the Streamlit docs use_column_width takes precedence
    # over width, so width=200 is presumably ignored; neither argument limits
    # the displayed height -- confirm the intended sizing.
    st.image(pil_image, caption="Uploaded Image.", use_column_width=True, width=200)

    # Instructions for the user
    st.markdown("### 2 Lies and 1 Truth")
    st.markdown("Write 3 descriptions about the image, 1 must be true.")

    # Get user input for descriptions
    description1 = st.text_input("Description 1:", placeholder='A red apple')
    description2 = st.text_input("Description 2:", placeholder='A car parked in a garage')
    description3 = st.text_input("Description 3:", placeholder='An orange fruit on a tree')

    # Strip surrounding whitespace so blank-looking inputs count as missing
    descriptions = [text.strip() for text in (description1, description2, description3)]

    # Button to trigger prediction
    if not st.button("Predict"):
        return

    if not all(descriptions):
        st.warning("Please fill in all three descriptions before predicting.")
        return

    # Make predictions
    best_description, best_prob = predict(pil_image, descriptions)

    # Display the highest probability description and its probability
    st.write(f"**Best Description:** {best_description}")
    st.write(f"**Prediction Probability:** {best_prob:.2%}")

    # Display progress bar for the highest probability
    st.progress(float(best_prob))

# Start the app only when this file is run as the main script, not when it is imported.
if __name__ == "__main__":
    main()