"""Gradio demo that captions an uploaded image with the BLIP base model."""

import gradio as gr
import torch
from PIL import Image
from transformers import BlipForConditionalGeneration, BlipProcessor

# Initialize BLIP model and processor
# Loaded once at import time so every request reuses the same weights.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-base"
).to(device)


def caption_image(image) -> str:
    """Generate a caption for *image* with the module-level BLIP model.

    Args:
        image: Image passed straight to the BLIP processor (``process_image``
            supplies a PIL image).

    Returns:
        The decoded caption with special tokens stripped.
    """
    # Preprocess and move the resulting tensors to the model's device.
    inputs = processor(images=image, return_tensors="pt").to(device)
    out = model.generate(**inputs)
    # generate() returns a batch; a single image means a single sequence.
    return processor.decode(out[0], skip_special_tokens=True)


def process_image(image) -> str:
    """Gradio callback: turn the uploaded array into a PIL image and caption it.

    Args:
        image: Array delivered by the ``gr.Image(type="numpy")`` input, or
            ``None`` when the form is submitted without an image.

    Returns:
        The generated caption, or an empty string when no image was given.
    """
    # Without this guard Image.fromarray(None) raises and the UI shows a
    # generic error instead of simply leaving the caption box empty.
    if image is None:
        return ""

    # Convert the input image to PIL Image. Normalize to RGB so grayscale or
    # RGBA arrays reach the processor in the three-channel layout it expects.
    pil_image = Image.fromarray(image).convert("RGB")

    # Get the caption
    return caption_image(pil_image)


# Create Gradio Interface
interface = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type="numpy", label="Upload Image"),
    outputs=gr.Textbox(label="Caption"),
    title="BLIP Image Captioning",
    description="Upload an image to get a caption generated by the BLIP model.",
)

# Launch the Gradio app
if __name__ == "__main__":
    interface.launch()