DeF0017 committed on
Commit 65f5126
1 parent: d0b78ba

Update app.py

Files changed (1)
  1. app.py +139 -175
app.py CHANGED
@@ -1,175 +1,139 @@
-import gradio as gr
-import spaces
-from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, TextIteratorStreamer
-from qwen_vl_utils import process_vision_info
-import torch
-from PIL import Image
-import subprocess
-import numpy as np
-import os
-from threading import Thread
-import uuid
-import io
-import re # Import regular expressions for word highlighting
-
-# Model and Processor Loading (Done once at startup)
-MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct"
-model = Qwen2VLForConditionalGeneration.from_pretrained(
-    MODEL_ID,
-    trust_remote_code=True,
-    torch_dtype=torch.float16
-).to("cuda").eval()
-processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
-
-DESCRIPTION = "[Qwen2-VL-2B Demo](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct)"
-
-# Define supported media extensions
-image_extensions = Image.registered_extensions()
-video_extensions = ("avi", "mp4", "mov", "mkv", "flv", "wmv", "mjpeg", "wav", "gif", "webm", "m4v", "3gp")
-
-
-def identify_and_save_blob(blob_path):
-    """Identifies if the blob is an image or video and saves it accordingly."""
-    try:
-        with open(blob_path, 'rb') as file:
-            blob_content = file.read()
-
-        # Try to identify if it's an image
-        try:
-            Image.open(io.BytesIO(blob_content)).verify() # Check if it's a valid image
-            extension = ".png" # Default to PNG for saving
-            media_type = "image"
-        except (IOError, SyntaxError):
-            # If it's not a valid image, assume it's a video
-            extension = ".mp4" # Default to MP4 for saving
-            media_type = "video"
-
-        # Create a unique filename
-        filename = f"temp_{uuid.uuid4()}_media{extension}"
-        with open(filename, "wb") as f:
-            f.write(blob_content)
-
-        return filename, media_type
-
-    except FileNotFoundError:
-        raise ValueError(f"The file {blob_path} was not found.")
-    except Exception as e:
-        raise ValueError(f"An error occurred while processing the file: {e}")
-
-
-@spaces.GPU
-def qwen_inference(media_input, search_word):
-    """
-    Performs OCR on the input media and highlights the search_word in the extracted text.
-
-    Args:
-        media_input (str): Path to the uploaded image or video file.
-        search_word (str): The word to search and highlight in the OCR result.
-
-    Yields:
-        str: The OCR result with highlighted search words.
-    """
-    text_input = "Extract text" # Hardcoded text query
-
-    if isinstance(media_input, str): # If it's a filepath
-        media_path = media_input
-        if media_path.endswith(tuple([i for i, f in image_extensions.items()])):
-            media_type = "image"
-        elif media_path.endswith(video_extensions):
-            media_type = "video"
-        else:
-            try:
-                media_path, media_type = identify_and_save_blob(media_input)
-                print(media_path, media_type)
-            except Exception as e:
-                print(e)
-                raise ValueError(
-                    "Unsupported media type. Please upload an image or video."
-                )
-
-    print(f"Processing media: {media_path} (Type: {media_type})")
-
-    messages = [
-        {
-            "role": "user",
-            "content": [
-                {
-                    "type": media_type,
-                    media_type: media_path,
-                    **({"fps": 8.0} if media_type == "video" else {}),
-                },
-                {"type": "text", "text": text_input},
-            ],
-        }
-    ]
-
-    # Apply chat template to format the input for the model
-    text = processor.apply_chat_template(
-        messages, tokenize=False, add_generation_prompt=True
-    )
-    image_inputs, video_inputs = process_vision_info(messages)
-
-    # Prepare model inputs
-    inputs = processor(
-        text=[text],
-        images=image_inputs,
-        videos=video_inputs,
-        padding=True,
-        return_tensors="pt",
-    ).to("cuda")
-
-    # Initialize the streamer for iterative generation
-    streamer = TextIteratorStreamer(
-        processor, skip_prompt=True, **{"skip_special_tokens": True}
-    )
-    generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024)
-
-    # Start the generation in a separate thread
-    thread = Thread(target=model.generate, kwargs=generation_kwargs)
-    thread.start()
-
-    buffer = ""
-    for new_text in streamer:
-        buffer += new_text
-        # Highlight the search_word in the buffer
-        if search_word:
-            # Use regex for case-insensitive search and highlight
-            pattern = re.compile(re.escape(search_word), re.IGNORECASE)
-            highlighted_text = pattern.sub(lambda m: f"<mark>{m.group(0)}</mark>", buffer)
-        else:
-            highlighted_text = buffer
-        yield highlighted_text
-
-
-css = """
-#output {
-    height: 500px;
-    overflow: auto;
-    border: 1px solid #ccc;
-}
-"""
-
-with gr.Blocks(css=css) as demo:
-    gr.Markdown(DESCRIPTION)
-
-    with gr.Tab(label="Image/Video Input"):
-        with gr.Row():
-            with gr.Column():
-                input_media = gr.File(
-                    label="Upload Image or Video", type="filepath"
-                )
-                search_word = gr.Textbox(
-                    label="Search Word", placeholder="Enter word to highlight", lines=1
-                )
-                submit_btn = gr.Button(value="Submit")
-            with gr.Column():
-                # Use HTML component to display highlighted text
-                output_text = gr.HTML(label="Output Text")
-
-        submit_btn.click(
-            qwen_inference,
-            inputs=[input_media, search_word],
-            outputs=[output_text]
-        )
-
-demo.launch(debug=True)
 
+from byaldi import RAGMultiModalModel
+from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
+from qwen_vl_utils import process_vision_info
+import torch
+import gradio as gr
+from PIL import Image
+import re
+
+
+# Load models
+def initialize_models():
+    """Loads and returns the RAG multimodal and Qwen2-VL models along with the processor."""
+    multimodal_rag = RAGMultiModalModel.from_pretrained("vidore/colpali")
+    qwen_model = Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-2B-Instruct", trust_remote_code=True, torch_dtype=torch.float32)
+    qwen_processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct", trust_remote_code=True)
+    return multimodal_rag, qwen_model, qwen_processor
+
+multimodal_rag, qwen_model, qwen_processor = initialize_models()
+
+# Text extraction function
+def perform_ocr(image):
+    """Extracts Sanskrit and English text from an image using the Qwen model."""
+    query = "Extract text from the image in original language"
+
+    # Format the request for the model
+    user_input = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "image", "image": image},
+                {"type": "text", "text": query}
+            ]
+        }
+    ]
+
+    # Preprocess the input
+    input_text = qwen_processor.apply_chat_template(user_input, tokenize=False, add_generation_prompt=True)
+    image_inputs, video_inputs = process_vision_info(user_input)
+    model_inputs = qwen_processor(
+        text=[input_text], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt"
+    ).to("cpu") # Use CPU for inference
+
+    # Generate output
+    with torch.no_grad():
+        generated_ids = qwen_model.generate(**model_inputs, max_new_tokens=2000)
+    trimmed_ids = [output[len(input_ids):] for input_ids, output in zip(model_inputs.input_ids, generated_ids)]
+    ocr_result = qwen_processor.batch_decode(trimmed_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+
+    return ocr_result
+
+# Keyword search function
+def highlight_keyword(text, keyword):
+    """Searches and highlights the keyword in the extracted text."""
+    keyword_lowercase = keyword.lower()
+    sentences = text.split('. ')
+    results = []
+
+    for sentence in sentences:
+        if keyword_lowercase in sentence.lower():
+            highlighted = re.sub(f'({re.escape(keyword)})', r'<mark>\1</mark>', sentence, flags=re.IGNORECASE)
+            results.append(highlighted)
+
+    return results if results else ["No matches found."]
+
+# Gradio app for text extraction
+def extract_text(image):
+    """Extracts text from an uploaded image."""
+    return perform_ocr(image)
+
+# Gradio app for keyword search
+def search_in_text(extracted_text, keyword):
+    """Searches for a keyword in the extracted text and highlights matches."""
+    results = highlight_keyword(extracted_text, keyword)
+    return "<br>".join(results)
+
+# Updated title with revised phrasing
+header_html = """
+<h1 style="text-align: center; color: #4CAF50;"><span class="gradient-text">OCR and Text Search Prototype</span></h1>
+"""
+
+# CSS to fix button sizes
+custom_css = """
+.gr-button {
+    width: 200px; /* Set a fixed width for the buttons */
+    padding: 12px 20px; /* Add padding to buttons for consistency */
+}
+.gr-textbox {
+    max-height: 300px; /* Set a maximum height for the extracted text output */
+    overflow-y: scroll; /* Enable scrolling when text exceeds the height */
+}
+"""
+
+# Gradio Interface
+with gr.Blocks(css=custom_css) as interface:
+
+    # Header section
+    gr.HTML(header_html)
+
+    # Sidebar section
+    with gr.Row():
+        with gr.Column(scale=1, min_width=200):
+            gr.Markdown("## Instructions")
+            gr.Markdown("""
+            1. Upload an image containing text.
+            2. Extract the text from the image.
+            3. Search for specific keywords within the extracted text.
+            """)
+            gr.Markdown("### Features")
+            gr.Markdown("""
+            - **OCR**: Extract text from images.
+            - **Keyword Search**: Search and highlight keywords in extracted text.
+            """)
+
+        with gr.Column(scale=3):
+            # Main content in tabs
+            with gr.Tabs():
+
+                # First Tab: Text Extraction
+                with gr.Tab("Extract Text"):
+                    gr.Markdown("### Upload an image to extract text:")
+                    with gr.Row():
+                        image_upload = gr.Image(type="pil", label="Upload Image", interactive=True)
+                    with gr.Row():
+                        extract_btn = gr.Button("Extract Text")
+                        extracted_textbox = gr.Textbox(label="Extracted Text")
+                    extract_btn.click(extract_text, inputs=image_upload, outputs=extracted_textbox)
+
+                # Second Tab: Keyword Search
+                with gr.Tab("Search in Extracted Text"):
+                    gr.Markdown("### Search for a keyword in the extracted text:")
+                    with gr.Row():
+                        keyword_searchbox = gr.Textbox(label="Enter Keyword", placeholder="Keyword to search")
+                    with gr.Row():
+                        search_btn = gr.Button("Search")
+                        search_results = gr.HTML(label="Results")
+                    search_btn.click(search_in_text, inputs=[extracted_textbox, keyword_searchbox], outputs=search_results)
+
+# Launch the Gradio App
+interface.launch()
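
A minimal, self-contained sketch of the new highlight_keyword helper added in this commit (the function body is copied from the diff above; the sample sentence and printed output are illustrative, not part of the commit):

import re

# Copied from the new app.py: sentence-scoped, case-insensitive highlighting.
def highlight_keyword(text, keyword):
    keyword_lowercase = keyword.lower()
    sentences = text.split('. ')
    results = []
    for sentence in sentences:
        if keyword_lowercase in sentence.lower():
            highlighted = re.sub(f'({re.escape(keyword)})', r'<mark>\1</mark>', sentence, flags=re.IGNORECASE)
            results.append(highlighted)
    return results if results else ["No matches found."]

# Only sentences containing the keyword are returned; the match keeps its original case.
print(highlight_keyword("Qwen reads the image. The Text is returned. Nothing else", "text"))
# ['The <mark>Text</mark> is returned']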