"""Inference handler: run Llama 3.2 Vision on a base64-encoded image plus a text prompt."""
import base64
import io
import logging

import torch
from PIL import Image
from transformers import AutoProcessor, MllamaForConditionalGeneration

logger = logging.getLogger(__name__)

# Load model and processor globally: this runs once at import time rather
# than on every handler call.
model_id = "meta-llama/Llama-3.2-90B-Vision-Instruct"
model = MllamaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(model_id)


def handler(event, context):
    """Generate text for one image/prompt pair.

    Args:
        event: Mapping with an ``'inputs'`` mapping that must contain
            ``'image'`` (base64-encoded image data) and ``'prompt'``
            (non-empty text).
        context: Unused; accepted so the signature matches the caller's
            ``handler(event, context)`` convention.

    Returns:
        ``{'generated_text': <str>}`` on success, or ``{'error': <str>}`` when
        a required input is missing or any step raises. Exceptions are never
        propagated to the caller.
    """
    try:
        # Parse inputs
        request = event.get('inputs', {})
        image_base64 = request.get('image')
        prompt = request.get('prompt', '')

        if not image_base64 or not prompt:
            return {'error': 'Both "image" and "prompt" are required in inputs.'}

        # Decode the base64 image and normalise it to 3-channel RGB.
        image_bytes = base64.b64decode(image_base64)
        image = Image.open(io.BytesIO(image_bytes)).convert('RGB')

        # Prepare the message: one user turn holding an image placeholder
        # followed by the text prompt.
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image"},
                    {"type": "text", "text": prompt},
                ],
            }
        ]
        input_text = processor.apply_chat_template(
            messages, add_generation_prompt=True
        )

        # Process inputs. The chat template already emits the begin-of-text
        # token, so the tokenizer must not add special tokens a second time.
        model_inputs = processor(
            image,
            input_text,
            add_special_tokens=False,
            return_tensors="pt",
        ).to(model.device)

        # Generate output
        output_ids = model.generate(**model_inputs, max_new_tokens=50)
        # NOTE(review): output_ids[0] is decoded in full, so the returned text
        # presumably includes the prompt as well as the completion -- confirm
        # that callers expect this.
        generated_text = processor.decode(output_ids[0], skip_special_tokens=True)

        # Return the result
        return {'generated_text': generated_text}

    except Exception as e:
        # Top-level boundary: keep the traceback in the logs while still
        # returning the error payload to the caller.
        logger.exception("handler failed")
        return {'error': str(e)}
#111