from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
import torch
import gradio as gr
from PIL import Image
from byaldi import RAGMultiModalModel
from qwen_vl_utils import process_vision_info
import os
import re
import tempfile
# Load the ColPali retrieval model (via byaldi) and the Qwen2-VL processor
RAG = RAGMultiModalModel.from_pretrained("vidore/colpali", device_map="cpu", torch_dtype=torch.float32)
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
def load_model():
    # ColPali (RAG.model) is an embedding/retrieval model and cannot generate
    # text, so load Qwen2-VL to perform the actual transcription
    return Qwen2VLForConditionalGeneration.from_pretrained(
        "Qwen/Qwen2-VL-2B-Instruct", torch_dtype=torch.float32, device_map="cpu"
    )

vlm = load_model()
def ocr_image(image, keyword=""):
    # Save the image to a temporary file so byaldi can index it
    with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as temp_file:
        image.save(temp_file, format="PNG")
        temp_file_path = temp_file.name
    try:
        # Index the image with ColPali; this enables page retrieval, while the
        # transcription itself comes from Qwen2-VL below (a retrieval search
        # cannot return OCR text)
        RAG.index(input_path=temp_file_path, index_name="temp_index", overwrite=True)
        # Build a chat-style prompt asking the VLM to transcribe the image
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": "Extract all text from this image."},
                ],
            }
        ]
        prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = processor(text=[prompt], images=image_inputs, videos=video_inputs,
                           padding=True, return_tensors="pt")
        # Generate, then strip the prompt tokens so only the new text remains
        generated_ids = vlm.generate(**inputs, max_new_tokens=512)
        trimmed = [out[len(inp):] for inp, out in zip(inputs.input_ids, generated_ids)]
        output_text = processor.batch_decode(trimmed, skip_special_tokens=True)[0]
        if keyword:
            # Match and highlight case-insensitively; plain str.replace would
            # miss occurrences whose case differs from the query
            pattern = re.compile(re.escape(keyword), re.IGNORECASE)
            if pattern.search(output_text):
                highlighted_text = pattern.sub(lambda m: f"**{m.group(0)}**", output_text)
                return f"Keyword '{keyword}' found in the text:\n\n{highlighted_text}"
            else:
                return f"Keyword '{keyword}' not found in the text:\n\n{output_text}"
        else:
            return output_text
    finally:
        # Clean up the temporary file
        os.unlink(temp_file_path)
def process_image(image, keyword=""):
    # Downscale large uploads to keep CPU inference time and memory bounded
    max_size = 1024
    if max(image.size) > max_size:
        image.thumbnail((max_size, max_size))
    return ocr_image(image, keyword=keyword)
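
# Quick sanity check outside the Gradio UI (the file name "sample.png" is
# hypothetical, used purely for illustration):
# print(process_image(Image.open("sample.png"), keyword="invoice"))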
interface = gr.Interface(
    fn=process_image,
    inputs=[
        gr.Image(type="pil"),
        gr.Textbox(label="Enter keyword to search (optional)")
    ],
    outputs="text",
    title="Hindi & English OCR with Keyword Search",
)

interface.launch()
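
# A sketch of alternative launch options (standard Gradio parameters, not
# specific to this app): share=True creates a temporary public URL, and
# server_name="0.0.0.0" binds all interfaces, as Spaces containers expect.
# interface.launch(share=True, server_name="0.0.0.0", server_port=7860)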