import os
import re
import tempfile

import gradio as gr
import torch
from byaldi import RAGMultiModalModel
from PIL import Image
from qwen_vl_utils import process_vision_info
from transformers import AutoProcessor

# Load the ColPali multimodal retrieval model on CPU in full precision.
# `RAG` is the module-level handle that ocr_image uses for indexing/search.
RAG = RAGMultiModalModel.from_pretrained("vidore/colpali", device_map="cpu", torch_dtype=torch.float32)

# Processor for the Qwen2-VL model.
# NOTE(review): `processor` and the `process_vision_info` import are not used
# anywhere in this chunk — confirm they are needed before removing.
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")

def load_model():
    """Return the underlying vision-language model held by the RAG wrapper."""
    model = RAG.model
    return model

# Module-level handle to the VLM.
# NOTE(review): `vlm` is never referenced later in this chunk — verify usage.
vlm = load_model()

def ocr_image(image, keyword=""):
    """Extract text from a PIL image via the ColPali RAG pipeline.

    Parameters:
        image: PIL.Image to run OCR-style retrieval on.
        keyword: optional string; when given, it is searched for
            case-insensitively in the extracted text and every occurrence
            is highlighted with Markdown bold (``**...**``).

    Returns:
        The extracted text, optionally prefixed with a found/not-found
        message when ``keyword`` is supplied.
    """
    # RAG.index reads from a path, so persist the in-memory image to disk.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as temp_file:
        image.save(temp_file, format='PNG')
        temp_file_path = temp_file.name

    try:
        # Index the single-image "corpus".
        RAG.index(input_path=temp_file_path, index_name="temp_index", overwrite=True)

        # Retrieve text from the image.
        results = RAG.search("Extract all text from this image", k=1)

        # Guard against an empty result set; the original indexed blindly
        # and would raise IndexError here.
        if not results:
            return ""

        # NOTE(review): assumes each result behaves like a dict with a
        # 'text' key — confirm against the byaldi result type.
        output_text = results[0].get('text', '')

        if keyword:
            # Bug fix: the original tested membership case-insensitively but
            # replaced case-sensitively, so a keyword differing only in case
            # was reported "found" yet never highlighted. Search and replace
            # now both ignore case, preserving the matched text's casing.
            pattern = re.compile(re.escape(keyword), re.IGNORECASE)
            if pattern.search(output_text):
                highlighted_text = pattern.sub(
                    lambda m: f"**{m.group(0)}**", output_text
                )
                return f"Keyword '{keyword}' found in the text:\n\n{highlighted_text}"
            else:
                return f"Keyword '{keyword}' not found in the text:\n\n{output_text}"
        else:
            return output_text
    finally:
        # Clean up the temporary file regardless of success or failure.
        os.unlink(temp_file_path)

def process_image(image, keyword=""):
    """Downscale oversized images in place, then run keyword-aware OCR.

    Images whose largest side exceeds 1024 px are shrunk (preserving
    aspect ratio) before being handed to ocr_image.
    """
    limit = 1024
    width, height = image.size
    if width > limit or height > limit:
        # thumbnail() resizes in place and keeps the aspect ratio.
        image.thumbnail((limit, limit))
    return ocr_image(image, keyword=keyword)

# Assemble the Gradio UI: an image upload plus an optional keyword box.
image_input = gr.Image(type="pil")
keyword_input = gr.Textbox(label="Enter keyword to search (optional)")

interface = gr.Interface(
    fn=process_image,
    inputs=[image_input, keyword_input],
    outputs="text",
    title="Hindi & English OCR with Keyword Search",
)

# Start the local web server (blocking call).
interface.launch()