Sana1207 committed
Commit e38a780
1 Parent(s): 5522918

Update app.py

Files changed (1):
  1. app.py +111 -92
app.py CHANGED
@@ -1,100 +1,119 @@
 
import streamlit as st
- import base64
- from byaldi import RAGMultiModalModel
- from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from PIL import Image
- from io import BytesIO
import torch
import re

@st.cache_resource
- def load_models():
-     RAG = RAGMultiModalModel.from_pretrained("vidore/colpali", verbose=10)
-     model = Qwen2VLForConditionalGeneration.from_pretrained(
-         "Qwen/Qwen2-VL-2B-Instruct",
-         torch_dtype=torch.float16,
-         device_map="auto",
-     )
    processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
-     return RAG, model, processor
-
- RAG, model, processor = load_models()
-
- st.title("Multimodal Image Search and Text Extraction App")
-
- uploaded_file = st.file_uploader("Choose an image", type=["jpg", "jpeg", "png"])
-
- if uploaded_file is not None:
-     image = Image.open(uploaded_file)
-     st.image(image, caption='Uploaded Image', use_column_width=True)
-
-     temp_image_path = "uploaded_image.jpeg"
-     image.save(temp_image_path)
-
-     @st.cache_data
-     def create_rag_index(image_path):
-         RAG.index(
-             input_path=image_path,
-             index_name="image_index",
-             store_collection_with_index=True,
-             overwrite=True,
-         )
-
-     create_rag_index(temp_image_path)
-
-     text_query = st.text_input("Enter your text query")
-
-     if st.button("Search and Extract Text"):
-         if text_query:
-             results = RAG.search(text_query, k=1, return_base64_results=True)
-
-             image_data = base64.b64decode(results[0].base64)
-             image = Image.open(BytesIO(image_data))
-             st.image(image, caption="Result Image", use_column_width=True)
-
-             messages = [
-                 {
-                     "role": "user",
-                     "content": [
-                         {"type": "image"},
-                         {"type": "text", "text": "Run OCR on the image"}
-                     ]
-                 }
-             ]
-
-             text_prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
-
-             inputs = processor(
-                 text=[text_prompt],
-                 images=[image],
-                 padding=True,
-                 return_tensors="pt"
-             )
-
-             inputs = inputs.to(model.device)
-
-             with torch.no_grad():
-                 output_ids = model.generate(**inputs, max_new_tokens=1024)
-
-             generated_ids = output_ids[:, inputs.input_ids.shape[1]:]
-
-             output_text = processor.batch_decode(
-                 generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
-             )[0]
-
-             # Highlight the queried text
-             def highlight_text(text, query):
-                 highlighted_text = text
-                 for word in query.split():
-                     pattern = re.compile(re.escape(word), re.IGNORECASE)
-                     highlighted_text = pattern.sub(lambda m: f'<span style="background-color: yellow;">{m.group()}</span>', highlighted_text)
-                 return highlighted_text
-
-             highlighted_output = highlight_text(output_text, text_query)
-
-             st.subheader("Extracted Text (with query highlighted):")
-             st.markdown(highlighted_output, unsafe_allow_html=True)
        else:
-             st.warning("Please enter a query.")
- else:
-     st.info("Upload an image to get started.")
+ from transformers import AutoModel, AutoTokenizer, Qwen2VLForConditionalGeneration, AutoProcessor
import streamlit as st
+ import os
from PIL import Image
+ import requests
import torch
+ import json
+ from torchvision import io
+ from typing import Dict
import re

@st.cache_resource
+ def init_model():
+     tokenizer = AutoTokenizer.from_pretrained('srimanth-d/GOT_CPU', trust_remote_code=True)
+     model = AutoModel.from_pretrained('srimanth-d/GOT_CPU', trust_remote_code=True, use_safetensors=True, pad_token_id=tokenizer.eos_token_id)
+     model = model.eval()
+     return model, tokenizer
+
+ def init_gpu_model():
+     tokenizer = AutoTokenizer.from_pretrained('ucaslcl/GOT-OCR2_0', trust_remote_code=True)
+     model = AutoModel.from_pretrained('ucaslcl/GOT-OCR2_0', trust_remote_code=True, low_cpu_mem_usage=True, device_map='cuda', use_safetensors=True, pad_token_id=tokenizer.eos_token_id)
+     model = model.eval().cuda()
+     return model, tokenizer
+
+ def init_qwen_model():
+     model = Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-2B-Instruct", device_map="cpu", torch_dtype=torch.float16)
    processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
+     return model, processor
+
+ def get_quen_op(image_file, model, processor):
+     try:
+         image = Image.open(image_file).convert('RGB')
+         conversation = [
+             {
+                 "role":"user",
+                 "content":[
+                     {
+                         "type":"image",
+                     },
+                     {
+                         "type":"text",
+                         "text":"Extract text from this image."
+                     }
+                 ]
+             }
+         ]
+         text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
+         inputs = processor(text=[text_prompt], images=[image], padding=True, return_tensors="pt")
+         inputs = {k: v.to(torch.float32) if torch.is_floating_point(v) else v for k, v in inputs.items()}
+
+         generation_config = {
+             "max_new_tokens": 32,
+             "do_sample": False,
+             "top_k": 20,
+             "top_p": 0.90,
+             "temperature": 0.4,
+             "num_return_sequences": 1,
+             "pad_token_id": processor.tokenizer.pad_token_id,
+             "eos_token_id": processor.tokenizer.eos_token_id,
+         }
+
+         output_ids = model.generate(**inputs, **generation_config)
+         if 'input_ids' in inputs:
+             generated_ids = output_ids[:, inputs['input_ids'].shape[1]:]
        else:
+             generated_ids = output_ids
+
+         output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
+
+         return output_text[:] if output_text else "No text extracted from the image."
+
+     except Exception as e:
+         return f"An error occurred: {str(e)}"
+
+ @st.cache_data
+ def get_text(image_file, _model, _tokenizer):
+     res = _model.chat(_tokenizer, image_file, ocr_type='ocr')
+     return res
+
+ def highlight_text(text, search_term):
+     if not search_term:
+         return text
+     pattern = re.compile(re.escape(search_term), re.IGNORECASE)
+     return pattern.sub(lambda m: f'<span style="background-color: grey;">{m.group()}</span>', text)
+
+ def save_text_to_json(file_name, text_data):
+     """Save the extracted text into a JSON file."""
+     with open(file_name, 'w') as json_file:
+         json.dump({"extracted_text": text_data}, json_file, indent=4)
+     st.success(f"Text saved to {file_name}")
+
+ st.title("Extract text from the image using - GOT-OCR2.0 and search keyword")
+ st.write("Upload an image")
+
+ MODEL, PROCESSOR = init_model()
+
+ image_file = st.file_uploader("Upload Image", type=['jpg', 'png', 'jpeg'])
+
+ if image_file:
+     if not os.path.exists("images"):
+         os.makedirs("images")
+     with open(f"images/{image_file.name}", "wb") as f:
+         f.write(image_file.getbuffer())
+
+     image_file = f"images/{image_file.name}"
+
+     text = get_text(image_file, MODEL, PROCESSOR)
+
+     print(text)
+
+     # Add search functionality
+     search_term = st.text_input("Enter a word or phrase to search:")
+     highlighted_text = highlight_text(text, search_term)
+
+     st.markdown(highlighted_text, unsafe_allow_html=True)
+
+     # Save the extracted text in JSON
+     json_file_path = f"{image_file}_extracted.json"
+     save_text_to_json(json_file_path, text)
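
After this change, app.py no longer builds a ColPali index with byaldi; it loads the GOT-OCR2.0 CPU checkpoint once and runs its chat() helper on each uploaded image. Below is a minimal sketch of that path outside Streamlit, assuming the srimanth-d/GOT_CPU checkpoint's remote code exposes the same chat() interface used in get_text() above; the image path is a placeholder, not a file from this repo.

# Minimal sketch: OCR a single image with the GOT-OCR2.0 CPU checkpoint,
# mirroring init_model() and get_text() from the updated app.py.
# 'images/sample.jpg' is a placeholder path.
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('srimanth-d/GOT_CPU', trust_remote_code=True)
model = AutoModel.from_pretrained(
    'srimanth-d/GOT_CPU',
    trust_remote_code=True,
    use_safetensors=True,
    pad_token_id=tokenizer.eos_token_id,
).eval()

text = model.chat(tokenizer, 'images/sample.jpg', ocr_type='ocr')  # plain-text OCR
print(text)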