WebashalarForML committed
Commit • 6fac153
1 Parent(s): 567a60e
Update utility/utils.py
utility/utils.py CHANGED (+16 -6)

@@ -5,6 +5,7 @@ from dotenv import load_dotenv
 import json
 import re
 import easyocr
+import spacy
 from PIL import Image, ImageEnhance, ImageDraw
 import cv2
 import numpy as np
@@ -22,6 +23,9 @@ client = InferenceClient(model="mistralai/Mistral-7B-Instruct-v0.3", token=HFT)
 # Initialize EasyOCR reader for extracting text
 reader = easyocr.Reader(['en'])
 
+# Initialize spaCy's English model
+nlp = spacy.load("en_core_web_sm")
+
 def draw_boxes(image, bounds, color='red', width=2):
     draw = ImageDraw.Draw(image)
     for bound in bounds:
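
Editor's note: the newly initialized nlp pipeline requires the en_core_web_sm model to be installed first (python -m spacy download en_core_web_sm). A minimal sketch of how the loaded pipeline could be applied to OCR output, e.g. to pull named entities; the helper name and sample text below are illustrative, not part of the commit:

import spacy

nlp = spacy.load("en_core_web_sm")

def extract_entities(ocr_text):
    # Run the spaCy pipeline and collect (text, label) pairs for every detected entity.
    doc = nlp(ocr_text)
    return [(ent.text, ent.label_) for ent in doc.ents]

# Labels depend on the model, e.g. PERSON / ORG / GPE:
print(extract_entities("John Doe, Acme Corp, Bengaluru"))
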
@@ -95,11 +99,12 @@ def ocr_with_paddle(img):
     return finaltext
 
 
-def extract_text_from_images(image_paths):
+def extract_text_from_images(image_paths, RESULT_FOLDER):
     all_extracted_texts = {}
     all_extracted_imgs={}
     for image_path in image_paths:
         # Enhance the image before OCR
+        #enhanced_image = load_image(image_path)
         enhanced_image = process_image(image_path, scale=2)
         bounds = reader.readtext(enhanced_image)
         # Draw boxes on the processed image
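
Editor's note: extract_text_from_images now also takes RESULT_FOLDER, presumably the directory where the box-annotated result images (result_image_path) are written further down in the function, outside this hunk. A usage sketch under that assumption; the paths are illustrative:

image_paths = ["uploads/card1.png", "uploads/card2.png"]   # illustrative input paths
RESULT_FOLDER = "static/results"                           # assumed output directory

texts, annotated = extract_text_from_images(image_paths, RESULT_FOLDER)
# texts:     {image_path: OCR text returned by ocr_with_paddle}
# annotated: {image_path: path of the image with EasyOCR boxes drawn}
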
@@ -111,15 +116,18 @@ def extract_text_from_images(image_paths):
 
         # Perform OCR on the enhanced image
         result=ocr_with_paddle(enhanced_image)
-
+        # results = reader.readtext(enhanced_image)
+        # extracted_text = " ".join([res[1] for res in results])
+
         all_extracted_texts[image_path] =result
         all_extracted_imgs[image_path] = result_image_path
     # Convert to JSON-compatible structure
     all_extracted_imgs_json = {str(k): str(v) for k, v in all_extracted_imgs.items()}
     return all_extracted_texts,all_extracted_imgs_json
 
+
 # Function to call the Gemma model and process the output as Json
-def Data_Extractor(data, client):
+def Data_Extractor(data, client=client):
     text = f'''Act as a Text extractor for the following text given in text: {data}
     extract text in the following output JSON string:
     {{
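
Editor's note: writing client=client binds the module-level InferenceClient as the default argument when the function is defined, so callers may omit it. A self-contained sketch of the pattern; the HFT environment-variable name mirrors the file but is an assumption here, and the function body is trimmed:

import os
from huggingface_hub import InferenceClient

HFT = os.getenv("HFT")  # assumption: token loaded from the environment, as via dotenv in utils.py
client = InferenceClient(model="mistralai/Mistral-7B-Instruct-v0.3", token=HFT)

def Data_Extractor(data, client=client):   # default evaluated once, at definition time
    text = f"Act as a Text extractor for the following text given in text: {data}"
    return client.text_generation(text, max_new_tokens=600)

# Data_Extractor(extracted_text)             # uses the shared client
# Data_Extractor(extracted_text, client=c2)  # c2: a hypothetical per-call override
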
@@ -134,7 +142,7 @@ def Data_Extractor(data, client):
     Output:
     '''
     # Call the API for inference
-    response = client.text_generation(text, max_new_tokens=600)
+    response = client.text_generation(text, max_new_tokens=600)#, temperature=0.4, top_k=50, top_p=0.9, repetition_penalty=1.2)
 
     print("parse in text ---:",response)
 
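
Editor's note: the trailing comment keeps optional sampling parameters handy; all of them are accepted by InferenceClient.text_generation. If enabled, the call would look like the sketch below (values copied from the comment, not a recommendation):

response = client.text_generation(
    text,
    max_new_tokens=600,
    temperature=0.4,
    top_k=50,
    top_p=0.9,
    repetition_penalty=1.2,
)
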
@@ -280,9 +288,10 @@ def extract_contact_details(text):
     # Email regex
     email_regex = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
 
-    #
+    # Profile links regex, updated to avoid conflicts with email domains
+    #link_regex = re.compile(r'\b(?:https?://)?(?:www\.)?(?:linkedin\.com|github\.com|indeed\.com|[A-Za-z0-9-]+\.[A-Za-z]{2,})[\w./?-]*\b')
+    #link_regex = re.compile(r'\b(?:https?://)?(?:www\.)?[a-zA-Z0-9-]+\.(?:[a-zA-Z]{2,})(?:\.[a-zA-Z]{2,})?(?:\.[a-zA-Z]{2,})?(?:[/\w.-]*)*[\w/?&=-]*\b')
     link_regex = re.compile(r'\b(?:https?:\/\/)?(?:www\.)[a-zA-Z0-9-]+\.(?:com|co\.in|co|io|org|net|edu|gov|mil|int|uk|us|in|de|au|app|tech|xyz|info|biz|fr|dev)\b')
-
     # Find all matches in the text
     phone_numbers = [num for num in combined_phone_regex.findall(text) if len(num) >= 5]
     print("phone_numbers--->",phone_numbers)
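
Editor's note: in the active pattern the (?:www\.) group is not optional, so bare domains without a www. prefix are ignored, which keeps e-mail domains out of the link results. A quick illustrative check (sample string invented for the example):

import re

link_regex = re.compile(r'\b(?:https?:\/\/)?(?:www\.)[a-zA-Z0-9-]+\.(?:com|co\.in|co|io|org|net|edu|gov|mil|int|uk|us|in|de|au|app|tech|xyz|info|biz|fr|dev)\b')

sample = "Portfolio: https://www.github.io, site: www.example.com, mail: jane@acme.org"
print(link_regex.findall(sample))
# ['https://www.github.io', 'www.example.com'] -- acme.org is skipped (no www.)
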
@@ -320,6 +329,7 @@ def process_extracted_text(extracted_text):
     combined_results["links_RE"].extend(contact_details["links_RE"])
 
     # Convert the combined results to JSON
+    #combined_results_json = json.dumps(combined_results, indent=4)
     combined_results_json = combined_results
 
     # Print the final JSON results
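
Editor's note: the commented-out json.dumps call would turn the results into a JSON-formatted string, whereas the committed line keeps the plain dict. A small illustration of the difference (sample dict invented):

import json

combined_results = {"emails": ["jane@acme.org"], "links_RE": ["www.example.com"]}
as_dict   = combined_results                            # what the code keeps
as_string = json.dumps(combined_results, indent=4)      # what the commented line would produce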