import gradio as gr
from PIL import Image
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, pipeline
from colpali_engine.models import ColPali, ColPaliProcessor
from huggingface_hub import login
import os
# Set device for computation
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Get Hugging Face token from environment variables
hf_token = os.getenv('HF_TOKEN')
# Log in to Hugging Face Hub (this will authenticate globally)
if hf_token:
    login(token=hf_token)
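# Note (assumption, not part of the original code): HF_TOKEN is expected to come from the
# environment, e.g. a Space secret or an export before running locally:
#     export HF_TOKEN=<your-token>   # placeholder, replace with a real token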
# Use pipeline for image-to-text task
try:
    image_to_text_pipeline = pipeline(
        "image-to-text",
        model="google/paligemma-3b-mix-448",
        device=0 if torch.cuda.is_available() else -1,
    )
except Exception as e:
    raise Exception(f"Error loading image-to-text model: {e}")
# Load ColPali model with Hugging Face token
try:
    model_colpali = ColPali.from_pretrained("vidore/colpali-v1.2", torch_dtype=torch.bfloat16).to(device)
    processor_colpali = ColPaliProcessor.from_pretrained("google/paligemma-3b-mix-448")
except Exception as e:
    raise Exception(f"Error loading ColPali model or processor: {e}")
# Load Qwen model
try:
    model_qwen = Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-7B-Instruct").to(device)
    processor_qwen = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
except Exception as e:
    raise Exception(f"Error loading Qwen model or processor: {e}")
# Function to process the image and extract text
def process_image(image, keyword):
    try:
        # Debugging: check the type of the input image
        print(f"Received image of type: {type(image)}")
        # Use the image-to-text pipeline to extract text from the image
        output_text_img_to_text = image_to_text_pipeline(image)
        # Debugging: check the output of the image-to-text model
        print(f"Output from image-to-text pipeline: {output_text_img_to_text}")
        # Prepare input for the Qwen model for image description
        conversation = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "Describe this image."}]}]
        text_prompt = processor_qwen.apply_chat_template(conversation, add_generation_prompt=True)
        inputs_qwen = processor_qwen(text=[text_prompt], images=[image], padding=True, return_tensors="pt").to(device)
        # Generate a response with the Qwen model
        with torch.no_grad():
            output_ids_qwen = model_qwen.generate(**inputs_qwen, max_new_tokens=128)
        # Keep only the newly generated tokens by stripping the prompt tokens
        generated_ids_qwen = [
            output_ids[len(input_ids):]
            for input_ids, output_ids in zip(inputs_qwen.input_ids, output_ids_qwen)
        ]
        output_text_qwen = processor_qwen.batch_decode(generated_ids_qwen, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        # Debugging: check the output from the Qwen model
        print(f"Output from Qwen model: {output_text_qwen}")
        extracted_text = output_text_img_to_text[0]["generated_text"]
        # Keyword search in the extracted text
        keyword_found = ""
        if keyword:
            if keyword.lower() in extracted_text.lower():
                keyword_found = f"Keyword '{keyword}' found in the text."
            else:
                keyword_found = f"Keyword '{keyword}' not found in the text."
        return extracted_text, output_text_qwen[0], keyword_found
    except Exception as e:
        return str(e), "", ""
# Define Gradio Interface
title = "OCR and Document Search Web Application"
description = "Upload an image containing text in both Hindi and English for OCR processing and keyword search."
# Gradio interface for input and output
image_input = gr.Image(type="pil")
keyword_input = gr.Textbox(label="Enter a keyword to search in the extracted text (Optional)")
output_textbox = gr.Textbox(label="Extracted Text")
output_description = gr.Textbox(label="Qwen Model Description")
output_keyword_search = gr.Textbox(label="Keyword Search Result")
# Set up Gradio interface layout
interface = gr.Interface(
    fn=process_image,  # Function to call when the user submits
    inputs=[image_input, keyword_input],  # Input components (image and keyword)
    outputs=[output_textbox, output_description, output_keyword_search],  # Output text boxes for results
    title=title,
    description=description,
)
# Launch the Gradio app
if __name__ == "__main__":
    interface.launch(share=True)