sprakhil committed
Commit 795b781
1 Parent(s): 45ab2ce

resolving pipeline issue

Files changed (1):
  1. app.py (+34 −13)
app.py CHANGED
@@ -1,36 +1,45 @@
  import streamlit as st
  from PIL import Image
  import torch
- from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, pipeline
+ from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, AutoModelForImageToText
  from colpali_engine.models import ColPali, ColPaliProcessor
  import os

+ # Set device for computation
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

+ # Get Hugging Face token from environment variables
  hf_token = os.getenv('HF_TOKEN')
+
+ # Load the processor and image-to-text model directly using Hugging Face token
  try:
-     model = pipeline("image-to-text", model="google/paligemma-3b-mix-448", use_auth_token=hf_token)
+     processor_img_to_text = AutoProcessor.from_pretrained("google/paligemma-3b-mix-448", use_auth_token=hf_token)
+     model_img_to_text = AutoModelForImageToText.from_pretrained("google/paligemma-3b-mix-448", use_auth_token=hf_token).to(device)
  except Exception as e:
      st.error(f"Error loading image-to-text model: {e}")
      st.stop()

+ # Load ColPali model with Hugging Face token
  try:
-     model_colpali = ColPali.from_pretrained("vidore/colpali-v1.2", torch_dtype=torch.bfloat16).to(device)
-     processor_colpali = ColPaliProcessor.from_pretrained("google/paligemma-3b-mix-448")
+     model_colpali = ColPali.from_pretrained("vidore/colpali-v1.2", torch_dtype=torch.bfloat16, use_auth_token=hf_token).to(device)
+     processor_colpali = ColPaliProcessor.from_pretrained("google/paligemma-3b-mix-448", use_auth_token=hf_token)
  except Exception as e:
      st.error(f"Error loading ColPali model or processor: {e}")
      st.stop()

+ # Load Qwen model with Hugging Face token
  try:
-     model_qwen = Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-7B-Instruct").to(device)
-     processor_qwen = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
+     model_qwen = Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-7B-Instruct", use_auth_token=hf_token).to(device)
+     processor_qwen = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct", use_auth_token=hf_token)
  except Exception as e:
      st.error(f"Error loading Qwen model or processor: {e}")
      st.stop()

+ # Streamlit UI
  st.title("OCR and Document Search Web Application")
  st.write("Upload an image containing text in both Hindi and English for OCR processing and keyword search.")

+ # File uploader for the image
  uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "jpeg", "png"])

  if uploaded_file is not None:
@@ -39,21 +48,33 @@ if uploaded_file is not None:
      st.image(image, caption='Uploaded Image.', use_column_width=True)
      st.write("")

+     # Use the image-to-text model to extract text from the image
+     inputs_img_to_text = processor_img_to_text(images=image, return_tensors="pt").to(device)
+     with torch.no_grad():
+         generated_ids_img_to_text = model_img_to_text.generate(**inputs_img_to_text, max_new_tokens=128)
+     output_text_img_to_text = processor_img_to_text.batch_decode(generated_ids_img_to_text, skip_special_tokens=True, clean_up_tokenization_spaces=True)
+
+     st.write("Extracted Text from Image:")
+     st.write(output_text_img_to_text)
+
+     # Prepare input for Qwen model for image description
      conversation = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "Describe this image."}]}]
      text_prompt = processor_qwen.apply_chat_template(conversation, add_generation_prompt=True)
-     inputs = processor_qwen(text=[text_prompt], images=[image], padding=True, return_tensors="pt").to(device)
+     inputs_qwen = processor_qwen(text=[text_prompt], images=[image], padding=True, return_tensors="pt").to(device)

+     # Generate response with Qwen model
      with torch.no_grad():
-         output_ids = model_qwen.generate(**inputs, max_new_tokens=128)
-         generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, output_ids)]
-         output_text = processor_qwen.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
+         output_ids_qwen = model_qwen.generate(**inputs_qwen, max_new_tokens=128)
+         generated_ids_qwen = [output_ids_qwen[len(input_ids):] for input_ids, output_ids_qwen in zip(inputs_qwen.input_ids, output_ids_qwen)]
+         output_text_qwen = processor_qwen.batch_decode(generated_ids_qwen, skip_special_tokens=True, clean_up_tokenization_spaces=True)

-     st.write("Extracted Text:")
-     st.write(output_text)
+     st.write("Qwen Model Description:")
+     st.write(output_text_qwen)

+     # Keyword search in the extracted text
      keyword = st.text_input("Enter a keyword to search in the extracted text:")
      if keyword:
-         if keyword.lower() in output_text[0].lower():
+         if keyword.lower() in output_text_img_to_text[0].lower():
              st.write(f"Keyword '{keyword}' found in the text.")
          else:
              st.write(f"Keyword '{keyword}' not found in the text.")