# Import required libraries
import gradio as gr
import os
import torch
from transformers import AutoProcessor, MllamaForConditionalGeneration
from PIL import Image

# Set up Hugging Face authentication
hf_token = os.getenv("HF_KEY")  # Read the access token from an environment variable
if not hf_token:
    raise ValueError("HF_KEY environment variable not set. Please set your Hugging Face token.")

# Model configuration and loading
model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
model = MllamaForConditionalGeneration.from_pretrained(
    model_name,
    token=hf_token,  # `use_auth_token` is deprecated; `token` is the current argument name
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(model_name, token=hf_token)

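# Note (a rough estimate, not from the original code): in bfloat16 the 11B
# model needs on the order of 22 GB of accelerator memory for the weights
# alone; device_map="auto" lets Accelerate shard them across available GPUs
# or offload to CPU when necessary.
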
# Define prediction function for image and text processing
def predict(image, text):
    if image is None:
        return "Please upload an image."
    # Build the chat message: an image placeholder plus the user's text
    messages = [
        {"role": "user", "content": [
            {"type": "image"},
            {"type": "text", "text": text}
        ]}
    ]
    # Render the messages into the model's prompt format
    input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
    # Preprocess the image and prompt, then move the tensors to the model's device;
    # add_special_tokens=False avoids a duplicated BOS token, since the chat
    # template output already contains the special tokens
    inputs = processor(image, input_text, add_special_tokens=False, return_tensors="pt").to(model.device)
    # Generate the model's response
    outputs = model.generate(**inputs, max_new_tokens=100)
    # Decode only the newly generated tokens, skipping the echoed prompt
    response = processor.decode(outputs[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True)
    return response

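# A quick way to sanity-check predict() outside Gradio (a minimal sketch;
# "test.jpg" is a hypothetical local image, not part of the original app):
#
#   from PIL import Image
#   print(predict(Image.open("test.jpg"), "Describe this image."))
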
# Set up the Gradio interface
interface = gr.Interface(
    fn=predict,
    inputs=[
        gr.Image(type="pil", label="Image Input"),
        gr.Textbox(label="Text Input")
    ],
    outputs=gr.Textbox(label="Output"),
    title="Llama 3.2 11B Vision Instruct Demo",
    description="Generates a text response from an image and a text prompt using Meta's Llama 3.2 Vision model."
)

# Launch the interface
interface.launch()
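
# To run this app outside Spaces (a minimal sketch; assumes your account has
# access to the gated meta-llama repository and `hf_...` stands in for a real
# read token):
#
#   export HF_KEY=hf_...
#   python app.py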