Spaces:

hgdgng
/

HG_Llama3.2

Runtime error

App Files Files Community

HG_Llama3.2 / app.py

hgdgng

Update app.py

8a0ad15 verified 23 days ago

raw

history blame contribute delete

1.82 kB

	# Import required libraries
	import gradio as gr
	import os
	import torch
	from transformers import AutoProcessor, MllamaForConditionalGeneration
	from PIL import Image

	# Set up Hugging Face authentication.
	# The access token is read from the HF_KEY environment variable so the
	# secret never has to appear in the source; startup fails fast when it is
	# missing or empty instead of failing later inside the download.
	hf_token = os.getenv("HF_KEY")
	if not hf_token:
	    raise ValueError("HF_KEY environment variable not set. Please set your Hugging Face token.")

	# Model configuration and loading.
	# `token=` is the supported keyword for passing credentials to
	# `from_pretrained`; the older `use_auth_token=` spelling is deprecated.
	model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
	model = MllamaForConditionalGeneration.from_pretrained(
	    model_name,
	    token=hf_token,
	    torch_dtype=torch.bfloat16,  # load the weights in bfloat16
	    device_map="auto",  # let the library decide weight placement
	)
	# The processor is loaded from the same checkpoint with the same token.
	processor = AutoProcessor.from_pretrained(model_name, token=hf_token)

	# Define prediction function for image and text processing
	def predict(image, text):
	    """Generate the model's reply to an image plus a text prompt.

	    Args:
	        image: The picture supplied by the Gradio image input (configured
	            with ``type="pil"``), or ``None`` when nothing was uploaded.
	        text: The user's prompt about the image.

	    Returns:
	        The decoded text of the newly generated tokens only; the prompt
	        that was fed to the model is not echoed back.

	    Raises:
	        gr.Error: If no image was provided.
	    """
	    # Gradio passes None for an empty image component; report that to the
	    # user instead of crashing inside the processor.
	    if image is None:
	        raise gr.Error("Please upload an image before submitting.")

	    # Prepare messages: one user turn holding an image slot and the text.
	    messages = [
	        {
	            "role": "user",
	            "content": [
	                {"type": "image"},
	                {"type": "text", "text": text},
	            ],
	        },
	    ]

	    # Create input text, ending with the generation prompt for the reply.
	    input_text = processor.apply_chat_template(messages, add_generation_prompt=True)

	    # Process inputs and move to device. The chat template already emits
	    # the begin-of-text token, so the tokenizer must not add another one.
	    inputs = processor(
	        images=image,
	        text=input_text,
	        add_special_tokens=False,
	        return_tensors="pt",
	    ).to(model.device)

	    # Generate model response (at most 100 new tokens).
	    outputs = model.generate(**inputs, max_new_tokens=100)

	    # generate() returns prompt + continuation; drop the prompt tokens so
	    # only the model's answer is decoded.
	    prompt_length = inputs["input_ids"].shape[-1]
	    generated_tokens = outputs[0][prompt_length:]

	    # Decode output
	    response = processor.decode(generated_tokens, skip_special_tokens=True)
	    return response

	# Build the Gradio UI: the input and output widgets are created first,
	# then wired to `predict` in a single Interface.
	_image_input = gr.Image(type="pil", label="Image Input")
	_text_input = gr.Textbox(label="Text Input")
	_output_box = gr.Textbox(label="Output")

	interface = gr.Interface(
	    title="Llama 3.2 11B Vision Instruct Demo",
	    description="Meta's new model that generates a response based on an image and text input.",
	    fn=predict,
	    inputs=[_image_input, _text_input],  # order matches predict(image, text)
	    outputs=_output_box,
	)

	# Start serving the app.
	interface.launch()