ImageDataExtractor2

Sleeping

App Files Files Community

WebashalarForML committed on Sep 30, 2024

Commit

8bd631d

verified ·

1 Parent(s): 4e653b1

Update utility/utils.py

Browse files

Files changed (1) hide show

utility/utils.py +30 -31

utility/utils.py CHANGED Viewed

@@ -43,19 +43,6 @@ HFT = os.getenv('HF_TOKEN')
 # Initialize the InferenceClient
 client = InferenceClient(model="mistralai/Mistral-7B-Instruct-v0.3", token=HFT)
-# Specify a custom model storage directory (ensure this path is writable)
-#model_storage_directory = '/app/models'
-# Create the reader object and set the model storage directory
-#reader = easyocr.Reader(['en'], model_storage_directory=model_storage_directory)
-def draw_boxes(image, bounds, color='red', width=2):
-    draw = ImageDraw.Draw(image)
-    for bound in bounds:
-        p0, p1, p2, p3 = bound[0]
-        draw.line([*p0, *p1, *p2, *p3, *p0], fill=color, width=width)
-    return image
 # Load image using OpenCV
 def load_image(image_path):
     image = cv2.imread(image_path)
@@ -108,11 +95,10 @@ def process_image(image_path, scale=2):
     return final_image
 def ocr_with_paddle(img):
-    finaltext = ''
-    #model_dir = os.getenv('PADDLEOCR_MODEL_DIR', '/tmp/.paddleocr')
-    #ocr = PaddleOCR(lang='en', use_angle_cls=True, det_model_dir=model_dir)
-    #ocr = PaddleOCR(lang='en', use_angle_cls=True, det_model_dir=os.environ['PADDLEOCR_HOME'])
     logging.info(f"PADDLEOCR_HOME: {os.environ['PADDLEOCR_HOME']}")
     ocr = PaddleOCR(
         lang='en',
@@ -123,12 +109,22 @@ def ocr_with_paddle(img):
     )
     result = ocr.ocr(img)
-    for i in range(len(result[0])):
-        text = result[0][i][1][0]
-        finaltext += ' ' + text
-    return finaltext
 def extract_text_from_images(image_paths):
     all_extracted_texts = {}
     all_extracted_imgs = {}
@@ -137,15 +133,18 @@ def extract_text_from_images(image_paths):
             # Enhance the image before OCR
             enhanced_image = process_image(image_path, scale=2)
-            # Draw boxes on the processed image (optional, requires bounds)
             img_result = Image.fromarray(enhanced_image)
             result_image_path = os.path.join(RESULT_FOLDER, f'result_{os.path.basename(image_path)}')
-            img_result.save(result_image_path)  # Save the processed image
-            # Perform OCR on the enhanced image
-            result = ocr_with_paddle(enhanced_image)
             all_extracted_texts[image_path] = result
             all_extracted_imgs[image_path] = result_image_path
         except ValueError as ve:
@@ -318,14 +317,14 @@ def extract_contact_details(text):
     # Email regex
     email_regex = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
-    # Profile links regex, updated to avoid conflicts with email domains
-    #link_regex = re.compile(r'\b(?:https?://)?(?:www\.)?(?:linkedin\.com|github\.com|indeed\.com|[A-Za-z0-9-]+\.[A-Za-z]{2,})[\w./?-]*\b')
-    #link_regex = re.compile(r'\b(?:https?://)?(?:www\.)?[a-zA-Z0-9-]+\.(?:[a-zA-Z]{2,})(?:\.[a-zA-Z]{2,})?(?:\.[a-zA-Z]{2,})?(?:[/\w.-]*)*[\w/?&=-]*\b')
     link_regex = re.compile(r'\b(?:https?:\/\/)?(?:www\.)[a-zA-Z0-9-]+\.(?:com|co\.in|co|io|org|net|edu|gov|mil|int|uk|us|in|de|au|app|tech|xyz|info|biz|fr|dev)\b')
     # Find all matches in the text
     phone_numbers = [num for num in combined_phone_regex.findall(text) if len(num) >= 5]
-    print("phone_numbers--->",phone_numbers)
     emails = email_regex.findall(text)
     links_RE = [link for link in link_regex.findall(text) if len(link)>=11]
     # Remove profile links that might conflict with emails
@@ -385,4 +384,4 @@ def process_resume_data(LLMdata,cont_data,extracted_text):
     processed_data['email'].extend(cont_data.get("emails", []))
     processed_data['contact_number'].extend(cont_data.get("phone_numbers", []))
     processed_data['Link'].extend(cont_data.get("links_RE", []))
-    return processed_data

 # Initialize the InferenceClient
 client = InferenceClient(model="mistralai/Mistral-7B-Instruct-v0.3", token=HFT)
 # Load image using OpenCV
 def load_image(image_path):
     image = cv2.imread(image_path)
     return final_image
+# Function for OCR with PaddleOCR, returning both text and bounding boxes
 def ocr_with_paddle(img):
+    final_text = ''
     logging.info(f"PADDLEOCR_HOME: {os.environ['PADDLEOCR_HOME']}")
     ocr = PaddleOCR(
         lang='en',
     )
     result = ocr.ocr(img)
+    boxes = []
+    for line in result[0]:
+        box, text, _ = line
+        boxes.append(box)  # Append the bounding box
+        final_text += ' ' + text
+    return final_text, boxes
+# Function to draw bounding boxes around text
+def draw_boxes(image, boxes):
+    draw = ImageDraw.Draw(image)
+    for box in boxes:
+        draw.polygon(box, outline="red", width=3)
+    return image
+# Extract text and create a result image with bounding boxes
 def extract_text_from_images(image_paths):
     all_extracted_texts = {}
     all_extracted_imgs = {}
             # Enhance the image before OCR
             enhanced_image = process_image(image_path, scale=2)
+            # Perform OCR on the enhanced image and get boxes
+            result, boxes = ocr_with_paddle(enhanced_image)
+            # Draw bounding boxes on the processed image
             img_result = Image.fromarray(enhanced_image)
+            img_with_boxes = draw_boxes(img_result, boxes)
+            # Save the image with boxes
             result_image_path = os.path.join(RESULT_FOLDER, f'result_{os.path.basename(image_path)}')
+            img_with_boxes.save(result_image_path)
+            # Store the text and image result paths
             all_extracted_texts[image_path] = result
             all_extracted_imgs[image_path] = result_image_path
         except ValueError as ve:
     # Email regex
     email_regex = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
+    # URL and links regex, updated to avoid conflicts with email domains
     link_regex = re.compile(r'\b(?:https?:\/\/)?(?:www\.)[a-zA-Z0-9-]+\.(?:com|co\.in|co|io|org|net|edu|gov|mil|int|uk|us|in|de|au|app|tech|xyz|info|biz|fr|dev)\b')
     # Find all matches in the text
     phone_numbers = [num for num in combined_phone_regex.findall(text) if len(num) >= 5]
     emails = email_regex.findall(text)
     links_RE = [link for link in link_regex.findall(text) if len(link)>=11]
     # Remove profile links that might conflict with emails
     processed_data['email'].extend(cont_data.get("emails", []))
     processed_data['contact_number'].extend(cont_data.get("phone_numbers", []))
     processed_data['Link'].extend(cont_data.get("links_RE", []))
+    return processed_data