# Hugging Face Space: Llama 3.2 Vision demo (running on ZeroGPU)
import os

import gradio as gr
import spaces
import torch
from huggingface_hub import login
from PIL import Image
from transformers import AutoProcessor, MllamaForConditionalGeneration
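# Dependency note (an assumption, not part of the original Space): the imports
# above roughly correspond to this requirements.txt; `spaces` is preinstalled on
# ZeroGPU hardware and `accelerate` is needed for device_map="auto":
#
#   gradio
#   torch
#   transformers
#   accelerate
#   huggingface_hub
#   Pillow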
# Authenticate with the Hugging Face Hub so the gated Llama 3.2 weights can be
# downloaded. The token is read from the Space secret named SECRET_ENV_VARIABLE.
huggingface_token = os.getenv("SECRET_ENV_VARIABLE")
if huggingface_token:
    login(token=huggingface_token)
# Load the Llama 3.2 Vision Model
def load_llama_model():
    model_id = "meta-llama/Llama-3.2-11B-Vision"

    # Load model and processor
    model = MllamaForConditionalGeneration.from_pretrained(
        model_id,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        offload_folder="offload",
    )
    model.tie_weights()
    processor = AutoProcessor.from_pretrained(model_id)

    return model, processor
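# Suggestion (not in the original file): process_input() below calls
# load_llama_model() on every request, so the 11B checkpoint is re-loaded each
# time. A minimal sketch of a module-level cache, assuming a hypothetical helper
# named get_llama_model:
#
#   _CACHE = {}
#
#   def get_llama_model():
#       if "model" not in _CACHE:
#           _CACHE["model"], _CACHE["processor"] = load_llama_model()
#       return _CACHE["model"], _CACHE["processor"]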
# Function to generate predictions for text and image
@spaces.GPU
def process_input(text, image=None):
    model, processor = load_llama_model()

    if image is not None:
        # If an image is uploaded, process it as a PIL Image object
        vision_input = image.convert("RGB").resize((224, 224))
        prompt = f"<|image|><|begin_of_text|>{text}"

        # Process image and text together
        inputs = processor(images=vision_input, text=prompt, return_tensors="pt").to(model.device)
    else:
        # If no image is uploaded, just process the text
        prompt = f"<|begin_of_text|>{text}"
        inputs = processor(text=prompt, return_tensors="pt").to(model.device)

    # Generate output from the model
    outputs = model.generate(**inputs, max_new_tokens=100)

    # Decode the output into readable text
    decoded_output = processor.decode(outputs[0], skip_special_tokens=True)
    return decoded_output
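# Quick smoke test (hypothetical usage, not part of the original file), e.g. from
# a Python shell on the Space hardware:
#
#   print(process_input("The capital of France is"))
#
#   from PIL import Image
#   img = Image.open("./examples/rococo.jpg")
#   print(process_input("The llama is ", image=img))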
# Gradio Interface Setup
def demo():
    # Define Gradio input and output components
    text_input = gr.Textbox(label="Text Input", placeholder="Enter text here", lines=5)
    # Use type="pil" so process_input receives a PIL Image object
    image_input = gr.Image(label="Upload an Image", type="pil")
    output = gr.Textbox(label="Model Output", lines=5)

    # Two examples for multimodal analysis
    examples = [
        ["The llama is ", "./examples/rococo.jpg"],
        ["The cute hamster is wearing ", "./examples/weather_events.png"],
    ]

    # Define the interface layout
    interface = gr.Interface(
        fn=process_input,
        inputs=[text_input, image_input],
        outputs=output,
        examples=examples,
        title="Llama 3.2 Multimodal Text-Image Analyzer",
        description=(
            "Upload an image and/or provide text for analysis using the "
            "Llama 3.2 Vision Model. You can also try out the provided examples."
        ),
    )

    # Launch the demo
    interface.launch()
# Run the demo
if __name__ == "__main__":
    demo()