import gradio as gr
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
import torch

# Load the BLIP captioning checkpoint once at import time, placing the model
# on the GPU when CUDA is available and on the CPU otherwise.
_BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
processor = BlipProcessor.from_pretrained(_BLIP_CHECKPOINT)
model = BlipForConditionalGeneration.from_pretrained(_BLIP_CHECKPOINT)
model = model.to(device)

def caption_image(image):
    """Generate a caption for ``image`` using the module-level BLIP model.

    The image is passed through the module-level ``processor``, the resulting
    tensors are moved to ``device``, and the first sequence returned by
    ``model.generate`` is decoded with special tokens skipped.

    Args:
        image: The image handed to the processor's ``images`` argument.

    Returns:
        The decoded caption text.
    """
    encoded = processor(images=image, return_tensors="pt")
    encoded = encoded.to(device)
    generated = model.generate(**encoded)
    # Only one image is submitted, so the first sequence is the caption.
    first_sequence = generated[0]
    return processor.decode(first_sequence, skip_special_tokens=True)

def process_image(image):
    """Return a BLIP caption for an image supplied by the Gradio input.

    Args:
        image: The uploaded image as a NumPy array (the interface's input
            component is configured with ``type="numpy"``), or ``None`` when
            the request was submitted without an image.

    Returns:
        The caption produced by ``caption_image``, or an empty string when
        no image was provided.
    """
    # Gradio hands the callback None when nothing was uploaded; converting
    # None to a PIL image would fail, so short-circuit with an empty caption.
    if image is None:
        return ""
    # Wrap the raw array in a PIL image before handing it to the captioner.
    pil_image = Image.fromarray(image)
    return caption_image(pil_image)

# Assemble the Gradio UI: one image upload feeding process_image, with the
# resulting caption shown in a text box.
_image_input = gr.Image(type="numpy", label="Upload Image")
_caption_output = gr.Textbox(label="Caption")
interface = gr.Interface(
    fn=process_image,
    inputs=_image_input,
    outputs=_caption_output,
    title="BLIP Image Captioning",
    description="Upload an image to get a caption generated by the BLIP model.",
)

# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
    interface.launch()