import streamlit as st
from transformers import AutoProcessor, AutoModelForCausalLM
from PIL import Image
import torch
import os

# Load the Florence model and processor
@st.cache_resource
def load_model():
    """Load the Florence-2-large model and its processor.

    The model is put in eval mode and moved to CUDA when a GPU is available,
    otherwise it is kept on the CPU instead of raising at ``.cuda()``.

    Returns:
        A ``(model, processor)`` tuple.
    """
    model_id = 'microsoft/Florence-2-large'
    use_cuda = torch.cuda.is_available()
    device = 'cuda' if use_cuda else 'cpu'
    # On GPU keep the checkpoint's own dtype ('auto'), exactly as before.
    # On CPU load in float32, as the Florence-2 model card does for the CPU case.
    torch_dtype = 'auto' if use_cuda else torch.float32
    # NOTE: trust_remote_code=True executes Python code shipped with the
    # checkpoint repository; only use it with model ids you trust.
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        trust_remote_code=True,
        torch_dtype=torch_dtype,
    ).eval().to(device)
    processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
    return model, processor

# Module-level model/processor handles used by run_example(); load_model()
# is wrapped in st.cache_resource.
model, processor = load_model()

# Function to run the model
def run_example(task_prompt, image, text_input=None):
    """Run one Florence-2 task on an image and return the parsed answer.

    Args:
        task_prompt: Task token understood by the model, e.g. ``'<CAPTION>'``.
        image: PIL image; its ``width`` and ``height`` are forwarded to the
            processor's post-processing step.
        text_input: Optional text appended directly to ``task_prompt``.

    Returns:
        The value returned by ``processor.post_process_generation`` for the
        given task.
    """
    prompt = task_prompt if text_input is None else task_prompt + text_input

    # Follow the model's actual device and dtype instead of hard-coding
    # 'cuda' / float16, so the inputs always match however the model was
    # loaded. The moved batch already lives on that device, so no further
    # per-tensor .cuda() calls are needed.
    inputs = processor(text=prompt, images=image, return_tensors="pt").to(
        model.device, model.dtype
    )
    generated_ids = model.generate(
        input_ids=inputs["input_ids"],
        pixel_values=inputs["pixel_values"],
        max_new_tokens=1024,
        early_stopping=False,
        do_sample=False,
        num_beams=3,
    )
    # Special tokens are kept in the decoded text that is handed to
    # post_process_generation.
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
    parsed_answer = processor.post_process_generation(
        generated_text,
        task=task_prompt,
        image_size=(image.width, image.height)
    )
    return parsed_answer

# Streamlit UI
st.title("Microsoft Florence Image Captioning")

# (display label, Florence-2 task token) pairs, in display order.
CAPTION_TASKS = (
    ("Caption", '<CAPTION>'),
    ("Detailed Caption", '<DETAILED_CAPTION>'),
    ("More Detailed Caption", '<MORE_DETAILED_CAPTION>'),
)


def _caption_text(result, task):
    """Extract the caption string from a run_example() result.

    Florence-2's post-processing is documented to wrap its answer as
    ``{task: answer}``; if the result has any other shape it is returned
    unchanged so nothing is lost.
    """
    if isinstance(result, dict) and task in result:
        return result[task]
    return result


# File uploader
uploaded_file = st.file_uploader("Upload an image (PNG or JPG)", type=["png", "jpg", "jpeg"])

if uploaded_file is not None:
    # Convert and display the image
    image = Image.open(uploaded_file).convert("RGB")
    st.image(image, caption="Uploaded Image", use_column_width=True)

    # Generate captions
    st.subheader("Generated Captions")

    # Streamlit re-executes this script on every widget interaction, so
    # clicking "Save Captions" would otherwise repeat all three generations.
    # Cache the captions in the session, keyed by the uploaded content.
    upload_key = (uploaded_file.name, hash(uploaded_file.getvalue()))
    cached = st.session_state.get("florence_captions")
    if cached is not None and cached[0] == upload_key:
        captions = cached[1]
    else:
        with st.spinner("Generating caption..."):
            captions = {
                label: _caption_text(run_example(task, image), task)
                for label, task in CAPTION_TASKS
            }
        st.session_state["florence_captions"] = (upload_key, captions)

    # Show the caption text itself rather than the raw {task: text} mapping.
    for label, text in captions.items():
        st.write(f"**{label}:**", text)

    # Option to save the output
    if st.button("Save Captions"):
        output_path = "captions.txt"
        # Explicit UTF-8 so captions with non-ASCII characters are written
        # the same way on every platform.
        with open(output_path, "w", encoding="utf-8") as file:
            file.writelines(f"{label}: {text}\n" for label, text in captions.items())
        st.success(f"Captions saved to {output_path}!")