import os
import re
import tempfile

import gradio as gr
import torch
from byaldi import RAGMultiModalModel
from PIL import Image
from qwen_vl_utils import process_vision_info
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration

# Load ColPali (via byaldi) for page indexing/retrieval. byaldi takes a
# `device` argument; the embedding dtype is handled internally.
RAG = RAGMultiModalModel.from_pretrained("vidore/colpali", device="cpu")

# Qwen2-VL does the actual text extraction; ColPali only retrieves pages.
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")


def load_model():
    # RAG.model is the ColPali retriever, which cannot generate text, so the
    # OCR step needs a generative VLM loaded separately.
    return Qwen2VLForConditionalGeneration.from_pretrained(
        "Qwen/Qwen2-VL-2B-Instruct", torch_dtype=torch.float32
    )


vlm = load_model()


def ocr_image(image: Image.Image, keyword: str = "") -> str:
    # Save the image to a temporary file so byaldi can index it.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as temp_file:
        image.save(temp_file, format="PNG")
        temp_file_path = temp_file.name

    try:
        # Index the page with ColPali. Searching this index returns scored
        # page matches, not extracted text, so the OCR itself is done by
        # Qwen2-VL below.
        RAG.index(input_path=temp_file_path, index_name="temp_index", overwrite=True)

        # Build a chat-format request asking Qwen2-VL to transcribe the image.
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": "Extract all text from this image."},
                ],
            }
        ]
        text_prompt = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = processor(
            text=[text_prompt],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )
        generated_ids = vlm.generate(**inputs, max_new_tokens=1024)
        # Strip the prompt tokens so only the generated transcription is decoded.
        trimmed_ids = [
            out_ids[len(in_ids):]
            for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        output_text = processor.batch_decode(
            trimmed_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )[0]

        if keyword:
            # Match case-insensitively, but preserve the original casing when
            # highlighting each occurrence in bold.
            pattern = re.compile(re.escape(keyword), re.IGNORECASE)
            if pattern.search(output_text):
                highlighted_text = pattern.sub(
                    lambda m: f"**{m.group(0)}**", output_text
                )
                return f"Keyword '{keyword}' found in the text:\n\n{highlighted_text}"
            return f"Keyword '{keyword}' not found in the text:\n\n{output_text}"
        return output_text
    finally:
        # Clean up the temporary file.
        os.unlink(temp_file_path)


def process_image(image: Image.Image, keyword: str = "") -> str:
    # Downscale large uploads to keep CPU inference time and memory bounded.
    max_size = 1024
    if max(image.size) > max_size:
        image.thumbnail((max_size, max_size))
    return ocr_image(image, keyword=keyword)


interface = gr.Interface(
    fn=process_image,
    inputs=[
        gr.Image(type="pil"),
        gr.Textbox(label="Enter keyword to search (optional)"),
    ],
    outputs="text",
    title="Hindi & English OCR with Keyword Search",
)

interface.launch()
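
# --- Optional: multi-page retrieval with byaldi (sketch, not run by the app) ---
# A minimal sketch of what the ColPali index is actually for. The directory
# "pages/", the index name "doc_index", and the query string are hypothetical
# placeholders. Search returns scored page matches (doc_id, page_num, score),
# never text, which is why the app above pairs retrieval with Qwen2-VL for OCR.
#
# RAG.index(input_path="pages/", index_name="doc_index", overwrite=True)
# results = RAG.search("hindi invoice total", k=3)
# for result in results:
#     print(result.doc_id, result.page_num, result.score)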