import os
import re
import tempfile

import gradio as gr
import torch
from byaldi import RAGMultiModalModel
from PIL import Image
from qwen_vl_utils import process_vision_info
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration

# Load ColPali (via byaldi) for page indexing/retrieval. byaldi takes a
# `device` argument; the embedding dtype is handled internally.
RAG = RAGMultiModalModel.from_pretrained("vidore/colpali", device="cpu")

# Qwen2-VL does the actual text extraction; ColPali only retrieves pages.
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")


def load_model():
    # RAG.model is the ColPali retriever, which cannot generate text, so the
    # OCR step needs a generative VLM loaded separately.
    return Qwen2VLForConditionalGeneration.from_pretrained(
        "Qwen/Qwen2-VL-2B-Instruct", torch_dtype=torch.float32
    )


vlm = load_model()


def ocr_image(image: Image.Image, keyword: str = "") -> str:
    # Save the image to a temporary file so byaldi can index it.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as temp_file:
        image.save(temp_file, format="PNG")
        temp_file_path = temp_file.name

    try:
        # Index the page with ColPali. Searching this index returns scored
        # page matches, not extracted text, so the OCR itself is done by
        # Qwen2-VL below.
        RAG.index(input_path=temp_file_path, index_name="temp_index", overwrite=True)

        # Build a chat-format request asking Qwen2-VL to transcribe the image.
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": "Extract all text from this image."},
                ],
            }
        ]
        text_prompt = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = processor(
            text=[text_prompt],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )
        generated_ids = vlm.generate(**inputs, max_new_tokens=1024)
        # Strip the prompt tokens so only the generated transcription is decoded.
        trimmed_ids = [
            out_ids[len(in_ids):]
            for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        output_text = processor.batch_decode(
            trimmed_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )[0]

        if keyword:
            # Match case-insensitively, but preserve the original casing when
            # highlighting each occurrence in bold.
            pattern = re.compile(re.escape(keyword), re.IGNORECASE)
            if pattern.search(output_text):
                highlighted_text = pattern.sub(
                    lambda m: f"**{m.group(0)}**", output_text
                )
                return f"Keyword '{keyword}' found in the text:\n\n{highlighted_text}"
            return f"Keyword '{keyword}' not found in the text:\n\n{output_text}"
        return output_text
    finally:
        # Clean up the temporary file.
        os.unlink(temp_file_path)


def process_image(image: Image.Image, keyword: str = "") -> str:
    # Downscale large uploads to keep CPU inference time and memory bounded.
    max_size = 1024
    if max(image.size) > max_size:
        image.thumbnail((max_size, max_size))
    return ocr_image(image, keyword=keyword)


interface = gr.Interface(
    fn=process_image,
    inputs=[
        gr.Image(type="pil"),
        gr.Textbox(label="Enter keyword to search (optional)"),
    ],
    outputs="text",
    title="Hindi & English OCR with Keyword Search",
)

interface.launch()
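
# --- Optional: multi-page retrieval with byaldi (sketch, not run by the app) ---
# A minimal sketch of what the ColPali index is actually for. The directory
# "pages/", the index name "doc_index", and the query string are hypothetical
# placeholders. Search returns scored page matches (doc_id, page_num, score),
# never text, which is why the app above pairs retrieval with Qwen2-VL for OCR.
#
# RAG.index(input_path="pages/", index_name="doc_index", overwrite=True)
# results = RAG.search("hindi invoice total", k=3)
# for result in results:
#     print(result.doc_id, result.page_num, result.score)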