"""Gradio demo: visual question answering with SmolVLM-Instruct."""

import gradio as gr
import torch
from transformers import AutoProcessor, AutoModelForVision2Seq

# Select GPU when available; fall back to CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Load the processor and model once at startup (module import), not per request.
processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM-Instruct")
model = AutoModelForVision2Seq.from_pretrained(
    "HuggingFaceTB/SmolVLM-Instruct",
    torch_dtype=torch.bfloat16,
    # FlashAttention 2 is CUDA-only; use the default eager attention on CPU.
    _attn_implementation="flash_attention_2" if DEVICE == "cuda" else "eager",
).to(DEVICE)


def answer_question(image, question):
    """Answer *question* about *image* using SmolVLM.

    Args:
        image: PIL image supplied by the Gradio image widget (may be None).
        question: free-form question string typed by the user.

    Returns:
        The model's generated answer with the echoed prompt stripped, or a
        short instruction string when the submission is incomplete.
    """
    # Guard against empty submissions from the UI instead of crashing.
    if image is None:
        return "Please upload an image."
    if not question:
        return "Please enter a question."

    # SmolVLM-Instruct expects its chat template: building the prompt this
    # way inserts the <image> placeholder token. Passing the raw question
    # string to the processor (as the original code did) fails because the
    # text contains no image token to align with the pixel inputs.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": question},
            ],
        }
    ]
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(images=[image], text=prompt, return_tensors="pt").to(DEVICE)

    # Inference only: skip autograd bookkeeping. Without an explicit
    # max_new_tokens, generate() defaults to ~20 tokens and truncates answers.
    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_new_tokens=500)

    # Decode only the newly generated tokens; decoding the full sequence
    # would echo the prompt back to the user.
    new_tokens = generated_ids[:, inputs["input_ids"].shape[1]:]
    return processor.batch_decode(new_tokens, skip_special_tokens=True)[0].strip()


# Gradio UI: image + question in, answer text out.
interface = gr.Interface(
    fn=answer_question,
    inputs=["image", "text"],
    outputs="text",
    title="SmolVLM - Vision-Language Question Answering",
    description="Upload an image and ask a question to get an answer powered by SmolVLM.",
)

if __name__ == "__main__":
    interface.launch()