import gradio as gr
from PIL import Image
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, pipeline
from colpali_engine.models import ColPali, ColPaliProcessor
from huggingface_hub import login
import os

# Set device for computation
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Get Hugging Face token from environment variables
hf_token = os.getenv('HF_TOKEN')

# Log in to Hugging Face Hub if a token is provided (this authenticates globally for gated models)
if hf_token:
    login(token=hf_token)

# Use pipeline for image-to-text task
try:
    image_to_text_pipeline = pipeline("image-to-text", model="google/paligemma-3b-mix-448", device=0 if torch.cuda.is_available() else -1)
except Exception as e:
    raise RuntimeError(f"Error loading image-to-text model: {e}") from e

# Load ColPali model and processor (authenticated via the global login above; only used by the optional relevance-scoring sketch below)
try:
    model_colpali = ColPali.from_pretrained("vidore/colpali-v1.2", torch_dtype=torch.bfloat16).to(device)
    processor_colpali = ColPaliProcessor.from_pretrained("google/paligemma-3b-mix-448")
except Exception as e:
    raise RuntimeError(f"Error loading ColPali model or processor: {e}") from e

# Load Qwen2-VL model and processor (torch_dtype="auto" loads the checkpoint in its native precision to reduce memory use)
try:
    model_qwen = Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-7B-Instruct", torch_dtype="auto").to(device)
    processor_qwen = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
except Exception as e:
    raise RuntimeError(f"Error loading Qwen model or processor: {e}") from e

# Function to process the image and extract text
def process_image(image, keyword):
    try:
        # Debugging: Check the type of the input image
        print(f"Received image of type: {type(image)}")

        # Use the image-to-text pipeline to extract text from the image
        output_text_img_to_text = image_to_text_pipeline(image)

        # Debugging: Check the output of the image-to-text model
        print(f"Output from image-to-text pipeline: {output_text_img_to_text}")

        # Prepare input for Qwen model for image description
        conversation = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "Describe this image."}]}]
        text_prompt = processor_qwen.apply_chat_template(conversation, add_generation_prompt=True)
        inputs_qwen = processor_qwen(text=[text_prompt], images=[image], padding=True, return_tensors="pt").to(device)

        # Generate response with Qwen model
        with torch.no_grad():
            output_ids_qwen = model_qwen.generate(**inputs_qwen, max_new_tokens=128)
            # Trim the prompt tokens so only the newly generated text is decoded
            generated_ids_qwen = [output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs_qwen.input_ids, output_ids_qwen)]
            output_text_qwen = processor_qwen.batch_decode(generated_ids_qwen, skip_special_tokens=True, clean_up_tokenization_spaces=True)

        # Debugging: Check the output from the Qwen model
        print(f"Output from Qwen model: {output_text_qwen}")

        extracted_text = output_text_img_to_text[0]['generated_text']

        # Keyword search in the extracted text
        keyword_found = ""
        if keyword:
            if keyword.lower() in extracted_text.lower():
                keyword_found = f"Keyword '{keyword}' found in the text."
            else:
                keyword_found = f"Keyword '{keyword}' not found in the text."

        return extracted_text, output_text_qwen[0], keyword_found
    except Exception as e:
        return f"Error processing image: {e}", "", ""

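# --- Optional: ColPali relevance scoring (illustrative sketch, not part of the original app) ---
# model_colpali is loaded above but never called in process_image. The helper below sketches how
# it could back the "document search" part of the app by scoring a text query against the uploaded
# page image. It assumes a colpali-engine version where ColPaliProcessor exposes process_images,
# process_queries, and score_multi_vector; the name score_page_relevance is hypothetical.
def score_page_relevance(image, query):
    """Return a ColPali late-interaction (MaxSim) similarity score between `query` and `image`."""
    batch_images = processor_colpali.process_images([image]).to(device)
    batch_queries = processor_colpali.process_queries([query]).to(device)
    with torch.no_grad():
        image_embeddings = model_colpali(**batch_images)   # multi-vector page embeddings
        query_embeddings = model_colpali(**batch_queries)  # multi-vector query embeddings
    # score_multi_vector returns a (num_queries, num_images) score matrix
    scores = processor_colpali.score_multi_vector(query_embeddings, image_embeddings)
    return scores[0, 0].item()
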
# Define Gradio Interface
title = "OCR and Document Search Web Application"
description = "Upload an image containing text in both Hindi and English for OCR processing and keyword search."

# Gradio components for input and output (the old gr.inputs / gr.outputs namespaces have been removed in current Gradio releases)
image_input = gr.Image(type="pil")
keyword_input = gr.Textbox(label="Enter a keyword to search in the extracted text (Optional)")
output_textbox = gr.Textbox(label="Extracted Text")
output_description = gr.Textbox(label="Qwen Model Description")
output_keyword_search = gr.Textbox(label="Keyword Search Result")

# Set up Gradio interface layout
interface = gr.Interface(
    fn=process_image,  # Function to call when button is pressed
    inputs=[image_input, keyword_input],  # Input types (image and keyword)
    outputs=[output_textbox, output_description, output_keyword_search],  # Outputs (text boxes for results)
    title=title,
    description=description
)

# Launch the Gradio app
if __name__ == "__main__":
    interface.launch(share=True)