# ocr-assignment / app.py
# Page header captured with this file: author "pranshh", commit bf86837
# (verified), message "Update app.py", size 1.99 kB.
import os
import re
import tempfile

import gradio as gr
import torch
from byaldi import RAGMultiModalModel
from PIL import Image
from qwen_vl_utils import process_vision_info
from transformers import AutoProcessor
# Retrieval wrapper built from the ColPali checkpoint, requested on CPU with
# float32 weights.
RAG = RAGMultiModalModel.from_pretrained(
    "vidore/colpali",
    device_map="cpu",
    torch_dtype=torch.float32,
)

# Processor for the Qwen2-VL 2B instruct checkpoint.
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
def load_model():
    """Return the model object held by the module-level ``RAG`` wrapper."""
    underlying_model = RAG.model
    return underlying_model


# Module-level handle to the model exposed by ``RAG``.
vlm = load_model()
def _highlight_keyword(text, keyword):
    """Wrap every case-insensitive occurrence of *keyword* in ``**``.

    Returns a ``(highlighted_text, match_count)`` tuple. Each match keeps the
    casing it has in *text*; *keyword* is treated as a literal string.
    """
    pattern = re.compile(re.escape(keyword), re.IGNORECASE)
    return pattern.subn(lambda match: f"**{match.group(0)}**", text)


def ocr_image(image, keyword=""):
    """Index an image with ``RAG``, query it for text, and optionally search it.

    Args:
        image: Object with a PIL-style ``save(fp, format=...)`` method.
        keyword: Optional text to look for (case-insensitively) in the
            retrieved text. Empty or ``None`` disables the search.

    Returns:
        The retrieved text when no keyword is given; otherwise a message
        saying whether the keyword was found, followed by the text (with
        every occurrence wrapped in ``**`` when found).
    """
    # delete=False: the file is handed to RAG.index by path after the handle
    # is closed, so it is removed manually in the ``finally`` below.
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".png")
    temp_file_path = temp_file.name
    try:
        # Saving happens inside the try so a failed save cannot leak the file.
        with temp_file:
            image.save(temp_file, format="PNG")

        # Index the image, replacing any index left by a previous call.
        RAG.index(input_path=temp_file_path, index_name="temp_index", overwrite=True)

        # Retrieve the single best match for the extraction prompt.
        results = RAG.search("Extract all text from this image", k=1)

        # NOTE(review): this assumes each search result exposes a dict-style
        # ``get`` with a 'text' entry -- confirm against the retrieval backend.
        first_text = results[0].get("text", "") if results else ""
        output_text = first_text or ""
    finally:
        # Clean up the temporary file whether or not indexing succeeded.
        os.unlink(temp_file_path)

    if not keyword:
        return output_text

    # Detection and highlighting share one case-insensitive pattern, so a
    # keyword reported as found is always actually highlighted.
    highlighted_text, hits = _highlight_keyword(output_text, keyword)
    if hits:
        return f"Keyword '{keyword}' found in the text:\n\n{highlighted_text}"
    return f"Keyword '{keyword}' not found in the text:\n\n{output_text}"
def process_image(image, keyword=""):
    """Downscale an uploaded image if needed, then run OCR with keyword search.

    Args:
        image: PIL-style image (has ``size``, ``copy()`` and ``thumbnail()``),
            or ``None`` when nothing was uploaded.
        keyword: Optional text to look for in the OCR output.

    Returns:
        The string produced by ``ocr_image``, or a short prompt asking for an
        upload when ``image`` is ``None``.
    """
    # Guard the no-upload case instead of failing on ``None.size``.
    if image is None:
        return "Please upload an image before running OCR."

    max_size = 1024
    if max(image.size) > max_size:
        # thumbnail() resizes in place; work on a copy so the caller's image
        # object is left untouched.
        image = image.copy()
        image.thumbnail((max_size, max_size))
    return ocr_image(image, keyword=keyword)
# UI inputs: an image upload delivered as a PIL image, plus an optional
# keyword box.
image_input = gr.Image(type="pil")
keyword_input = gr.Textbox(label="Enter keyword to search (optional)")

# Wire the inputs to process_image and show its return value as plain text.
interface = gr.Interface(
    fn=process_image,
    inputs=[image_input, keyword_input],
    outputs="text",
    title="Hindi & English OCR with Keyword Search",
)

# Start serving as soon as the module is executed.
interface.launch()