"""Gradio demo: visual question answering with SmolVLM-Instruct."""

import gradio as gr
import torch
from transformers import AutoProcessor, AutoModelForVision2Seq

# Select GPU when available; fall back to CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Load the processor and model once at startup (module import), not per request.
processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM-Instruct")
model = AutoModelForVision2Seq.from_pretrained(
    "HuggingFaceTB/SmolVLM-Instruct",
    torch_dtype=torch.bfloat16,
    # FlashAttention 2 is CUDA-only; use the default eager attention on CPU.
    _attn_implementation="flash_attention_2" if DEVICE == "cuda" else "eager",
).to(DEVICE)


def answer_question(image, question):
    """Answer *question* about *image* using SmolVLM.

    Args:
        image: PIL image supplied by the Gradio image widget (may be None).
        question: free-form question string typed by the user.

    Returns:
        The model's generated answer with the echoed prompt stripped, or a
        short instruction string when the submission is incomplete.
    """
    # Guard against empty submissions from the UI instead of crashing.
    if image is None:
        return "Please upload an image."
    if not question:
        return "Please enter a question."

    # SmolVLM-Instruct expects its chat template: building the prompt this
    # way inserts the <image> placeholder token. Passing the raw question
    # string to the processor (as the original code did) fails because the
    # text contains no image token to align with the pixel inputs.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": question},
            ],
        }
    ]
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(images=[image], text=prompt, return_tensors="pt").to(DEVICE)

    # Inference only: skip autograd bookkeeping. Without an explicit
    # max_new_tokens, generate() defaults to ~20 tokens and truncates answers.
    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_new_tokens=500)

    # Decode only the newly generated tokens; decoding the full sequence
    # would echo the prompt back to the user.
    new_tokens = generated_ids[:, inputs["input_ids"].shape[1]:]
    return processor.batch_decode(new_tokens, skip_special_tokens=True)[0].strip()


# Gradio UI: image + question in, answer text out.
interface = gr.Interface(
    fn=answer_question,
    inputs=["image", "text"],
    outputs="text",
    title="SmolVLM - Vision-Language Question Answering",
    description="Upload an image and ask a question to get an answer powered by SmolVLM.",
)

if __name__ == "__main__":
    interface.launch()