import gradio as gr
import requests
import torch
from PIL import Image
import spaces
from transformers import MllamaForConditionalGeneration, AutoProcessor
import os
from huggingface_hub import login

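# Authenticate with the Hugging Face Hub; the Llama 3.2 checkpoints are gated, so a
# valid access token is expected in the SECRET_ENV_VARIABLE secret of this Space.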
huggingface_token = os.getenv("SECRET_ENV_VARIABLE")
login(huggingface_token)

# Load the Llama 3.2 Vision Model
def load_llama_model():
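    """Load the Llama 3.2 11B Vision checkpoint and its processor from the Hub."""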
    model_id = "meta-llama/Llama-3.2-11B-Vision"

    # Load model and processor
    model = MllamaForConditionalGeneration.from_pretrained(
        model_id,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        offload_folder="offload",  # directory used if weights must be offloaded to disk
    )
    model.tie_weights()  # re-tie shared input/output embedding weights after loading
    processor = AutoProcessor.from_pretrained(model_id)

    return model, processor

# Function to generate predictions for text and image
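# The spaces.GPU decorator requests GPU hardware for the duration of each call
# when the app runs on a ZeroGPU Space.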
@spaces.GPU
def process_input(text, image=None):
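    """Generate a completion for the given text, optionally conditioned on an image."""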
    model, processor = load_llama_model()  # note: reloads the checkpoint on every request

    if image:
        # If an image is uploaded, process it as a PIL Image object
        vision_input = image.convert("RGB").resize((224, 224))

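        # The base (non-instruct) vision checkpoint expects the <|image|> token to come
        # before <|begin_of_text|> and the user text whenever an image is provided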
        prompt = f"<|image|><|begin_of_text|>{text}"

        # Process image and text together
        inputs = processor(images=vision_input, text=prompt, return_tensors="pt").to(model.device)
    else:
        # If no image is uploaded, just process the text
        prompt = f"<|begin_of_text|>{text}"
        inputs = processor(text=prompt, return_tensors="pt").to(model.device)

    # Generate output from the model
    outputs = model.generate(**inputs, max_new_tokens=100)

    # Decode the output to return a readable text
    decoded_output = processor.decode(outputs[0], skip_special_tokens=True)

    return decoded_output

# Gradio Interface Setup

def demo():
    # Define Gradio input and output components
    text_input = gr.Textbox(label="Text Input", placeholder="Enter text here", lines=5)
    image_input = gr.Image(label="Upload an Image", type="pil")
    output = gr.Textbox(label="Model Output", lines=5)

    # Add two examples for multimodal analysis
    examples = [
        ["The llama is ", "./examples/rococo.jpg"],
        ["The cute hamster is wearing ", "./examples/weather_events.png"]
    ]

    # Define the interface layout
    interface = gr.Interface(
        fn=process_input,
        inputs=[text_input, image_input],
        outputs=output,
        examples=examples,
        title="Llama 3.2 Multimodal Text-Image Analyzer",
        description="Upload an image and/or provide text for analysis using the Llama 3.2 Vision Model. You can also try out the provided examples.",
    )

    # Launch the demo
    interface.launch()

# Run the demo
if __name__ == "__main__":
    demo()