"""Streamlit demo app for visual question answering with ViLT and BLIP."""

import streamlit as st
import torch
from PIL import Image
from transformers import (
    AutoProcessor,
    BlipForQuestionAnswering,
    ViltForQuestionAnswering,
    ViltProcessor,
)

# Define available models:
# display name -> (processor class, model class, Hugging Face checkpoint name)
models = {
    "ViLT": (ViltProcessor, ViltForQuestionAnswering, "dandelin/vilt-b32-finetuned-vqa"),
    "BLIP": (AutoProcessor, BlipForQuestionAnswering, "Salesforce/blip-vqa-base"),
}


@st.cache_resource
def _load_model(selected_model):
    """Load the processor and model for *selected_model*.

    Cached with ``st.cache_resource`` so the checkpoints are not re-loaded
    every time Streamlit reruns the script.

    Args:
        selected_model: A key of ``models``.

    Returns:
        A ``(processor, model)`` tuple of loaded instances.

    Raises:
        KeyError: If *selected_model* is not a key of ``models``.
    """
    processor_class, model_class, model_name = models[selected_model]
    processor = processor_class.from_pretrained(model_name)
    model = model_class.from_pretrained(model_name)
    return processor, model


def get_format_response(image, question, selected_model):
    """Answer *question* about *image* using the selected VQA model.

    Args:
        image: The image to query (a PIL image in this app).
        question: The natural-language question about the image.
        selected_model: A key of ``models`` ("ViLT" or "BLIP").

    Returns:
        The model's answer as a string.

    Raises:
        KeyError: If *selected_model* is not a key of ``models``.
    """
    processor, model = _load_model(selected_model)
    encoding = processor(image, question, return_tensors="pt")

    # Inference only: no gradients are needed.
    with torch.no_grad():
        if selected_model == "ViLT":
            # ViLT is a classifier over a fixed answer vocabulary: take the
            # highest-scoring label.
            logits = model(**encoding).logits
            idx = logits.argmax(-1).item()
            return model.config.id2label[idx]

        # BLIP is generative: its forward pass needs labels/decoder inputs,
        # so the answer tokens must be produced with generate() and decoded.
        out = model.generate(**encoding)
        return processor.decode(out[0], skip_special_tokens=True)


# Streamlit app
st.title("Simple VQA App 🤖🎈")
st.subheader("A demo app showcasing VQA models. ViLT and BLIP model.")

# Sidebar for model selection
selected_model = st.sidebar.selectbox("Select Model", list(models.keys()))

# Image and question input
uploaded_image = st.file_uploader("Upload Image")
question = st.text_input("Ask a Question about the Image")

# Process image and question if provided
if uploaded_image and question:
    try:
        # Normalize to RGB so RGBA / palette / grayscale uploads are handled
        # uniformly by the processors.
        image = Image.open(uploaded_image).convert("RGB")
    except OSError:
        # PIL raises an OSError subclass when the upload is not a readable image.
        st.error("Could not read the uploaded file as an image.")
    else:
        st.image(image, caption="Uploaded Image")
        answer = get_format_response(image, question, selected_model)

        # Display answer
        st.write(f"🤔 Model Answer: {answer} 🎉")

# Disclaimer
st.sidebar.markdown("This is a demo app showcasing VQA models. Actual performance may vary.")