|
import gradio as gr |
|
from transformers import BlipForConditionalGeneration, AutoProcessor |
|
from PIL import Image |
|
import torch |
|
|
|
|
|
# Directory holding the fine-tuned BLIP checkpoint; the processor and the
# model weights are both loaded from this one location.
MODEL_DIR = "blip-fine-tuned/"

processor = AutoProcessor.from_pretrained(MODEL_DIR)
# The tokenizer option is `padding_side`; the previous `padding_size`
# assignment only created an unused attribute, so left padding was never
# actually applied.
processor.tokenizer.padding_side = "left"

model = BlipForConditionalGeneration.from_pretrained(MODEL_DIR)

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
|
|
|
def predict(image):
    """Generate a caption for one image with the module-level BLIP model.

    Args:
        image: The image to caption, in a form accepted by ``processor``
            (the Gradio input in this file is configured with
            ``type="pil"``), or ``None`` when the request carries no image.

    Returns:
        The decoded caption with special tokens stripped, or an empty
        string when ``image`` is ``None``.
    """
    # A submission without an image arrives as None; return an empty
    # caption instead of letting the processor fail on it.
    if image is None:
        return ""

    # Preprocess the image into tensors and move them to the model's device.
    inputs = processor(images=image, return_tensors="pt").to(device)

    # Inference only: no gradients are needed for generation.
    with torch.no_grad():
        generated_ids = model.generate(
            pixel_values=inputs.pixel_values,
            max_length=100,
        )

    # batch_decode returns one string per sequence; a single image was
    # passed in, so the first entry is the caption.
    return processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
|
|
|
|
|
|
|
# Web UI: a single image input wired to `predict`, with the caption shown
# as plain text.
interface = gr.Interface(
    fn=predict,
    inputs=gr.Image(type="pil"),  # hand the upload to `predict` as a PIL image
    outputs="text",
    title="BLIP Image Caption Generator",
    description="Upload an image or select a sample to generate a descriptive caption.",
)

# Launch the interface as soon as this module is executed.
interface.launch()