WebashalarForML committed on
Commit
8bd631d
·
verified ·
1 Parent(s): 4e653b1

Update utility/utils.py

Browse files
Files changed (1) hide show
  1. utility/utils.py +30 -31
utility/utils.py CHANGED
@@ -43,19 +43,6 @@ HFT = os.getenv('HF_TOKEN')
43
  # Initialize the InferenceClient
44
  client = InferenceClient(model="mistralai/Mistral-7B-Instruct-v0.3", token=HFT)
45
 
46
- # Specify a custom model storage directory (ensure this path is writable)
47
- #model_storage_directory = '/app/models'
48
-
49
- # Create the reader object and set the model storage directory
50
- #reader = easyocr.Reader(['en'], model_storage_directory=model_storage_directory)
51
-
52
- def draw_boxes(image, bounds, color='red', width=2):
53
- draw = ImageDraw.Draw(image)
54
- for bound in bounds:
55
- p0, p1, p2, p3 = bound[0]
56
- draw.line([*p0, *p1, *p2, *p3, *p0], fill=color, width=width)
57
- return image
58
-
59
  # Load image using OpenCV
60
  def load_image(image_path):
61
  image = cv2.imread(image_path)
@@ -108,11 +95,10 @@ def process_image(image_path, scale=2):
108
 
109
  return final_image
110
 
 
111
  def ocr_with_paddle(img):
112
- finaltext = ''
113
- #model_dir = os.getenv('PADDLEOCR_MODEL_DIR', '/tmp/.paddleocr')
114
- #ocr = PaddleOCR(lang='en', use_angle_cls=True, det_model_dir=model_dir)
115
- #ocr = PaddleOCR(lang='en', use_angle_cls=True, det_model_dir=os.environ['PADDLEOCR_HOME'])
116
  logging.info(f"PADDLEOCR_HOME: {os.environ['PADDLEOCR_HOME']}")
117
  ocr = PaddleOCR(
118
  lang='en',
@@ -123,12 +109,22 @@ def ocr_with_paddle(img):
123
  )
124
 
125
  result = ocr.ocr(img)
 
 
 
 
 
 
 
126
 
127
- for i in range(len(result[0])):
128
- text = result[0][i][1][0]
129
- finaltext += ' ' + text
130
- return finaltext
 
 
131
 
 
132
  def extract_text_from_images(image_paths):
133
  all_extracted_texts = {}
134
  all_extracted_imgs = {}
@@ -137,15 +133,18 @@ def extract_text_from_images(image_paths):
137
  # Enhance the image before OCR
138
  enhanced_image = process_image(image_path, scale=2)
139
 
140
- # Draw boxes on the processed image (optional, requires bounds)
 
 
 
141
  img_result = Image.fromarray(enhanced_image)
 
142
 
 
143
  result_image_path = os.path.join(RESULT_FOLDER, f'result_{os.path.basename(image_path)}')
144
- img_result.save(result_image_path) # Save the processed image
145
-
146
- # Perform OCR on the enhanced image
147
- result = ocr_with_paddle(enhanced_image)
148
 
 
149
  all_extracted_texts[image_path] = result
150
  all_extracted_imgs[image_path] = result_image_path
151
  except ValueError as ve:
@@ -318,14 +317,14 @@ def extract_contact_details(text):
318
  # Email regex
319
  email_regex = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
320
 
321
- # Profile links regex, updated to avoid conflicts with email domains
322
- #link_regex = re.compile(r'\b(?:https?://)?(?:www\.)?(?:linkedin\.com|github\.com|indeed\.com|[A-Za-z0-9-]+\.[A-Za-z]{2,})[\w./?-]*\b')
323
- #link_regex = re.compile(r'\b(?:https?://)?(?:www\.)?[a-zA-Z0-9-]+\.(?:[a-zA-Z]{2,})(?:\.[a-zA-Z]{2,})?(?:\.[a-zA-Z]{2,})?(?:[/\w.-]*)*[\w/?&=-]*\b')
324
  link_regex = re.compile(r'\b(?:https?:\/\/)?(?:www\.)[a-zA-Z0-9-]+\.(?:com|co\.in|co|io|org|net|edu|gov|mil|int|uk|us|in|de|au|app|tech|xyz|info|biz|fr|dev)\b')
 
325
  # Find all matches in the text
326
  phone_numbers = [num for num in combined_phone_regex.findall(text) if len(num) >= 5]
327
- print("phone_numbers--->",phone_numbers)
328
  emails = email_regex.findall(text)
 
329
  links_RE = [link for link in link_regex.findall(text) if len(link)>=11]
330
 
331
  # Remove profile links that might conflict with emails
@@ -385,4 +384,4 @@ def process_resume_data(LLMdata,cont_data,extracted_text):
385
  processed_data['email'].extend(cont_data.get("emails", []))
386
  processed_data['contact_number'].extend(cont_data.get("phone_numbers", []))
387
  processed_data['Link'].extend(cont_data.get("links_RE", []))
388
- return processed_data
 
43
  # Initialize the InferenceClient
44
  client = InferenceClient(model="mistralai/Mistral-7B-Instruct-v0.3", token=HFT)
45
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  # Load image using OpenCV
47
  def load_image(image_path):
48
  image = cv2.imread(image_path)
 
95
 
96
  return final_image
97
 
98
+ # Function for OCR with PaddleOCR, returning both text and bounding boxes
99
  def ocr_with_paddle(img):
100
+ final_text = ''
101
+
 
 
102
  logging.info(f"PADDLEOCR_HOME: {os.environ['PADDLEOCR_HOME']}")
103
  ocr = PaddleOCR(
104
  lang='en',
 
109
  )
110
 
111
  result = ocr.ocr(img)
112
+ boxes = []
113
+ for line in result[0]:
114
+ box, text, _ = line
115
+ boxes.append(box) # Append the bounding box
116
+ final_text += ' ' + text
117
+
118
+ return final_text, boxes
119
 
120
+ # Function to draw bounding boxes around text
121
+ def draw_boxes(image, boxes):
122
+ draw = ImageDraw.Draw(image)
123
+ for box in boxes:
124
+ draw.polygon(box, outline="red", width=3)
125
+ return image
126
 
127
+ # Extract text and create a result image with bounding boxes
128
  def extract_text_from_images(image_paths):
129
  all_extracted_texts = {}
130
  all_extracted_imgs = {}
 
133
  # Enhance the image before OCR
134
  enhanced_image = process_image(image_path, scale=2)
135
 
136
+ # Perform OCR on the enhanced image and get boxes
137
+ result, boxes = ocr_with_paddle(enhanced_image)
138
+
139
+ # Draw bounding boxes on the processed image
140
  img_result = Image.fromarray(enhanced_image)
141
+ img_with_boxes = draw_boxes(img_result, boxes)
142
 
143
+ # Save the image with boxes
144
  result_image_path = os.path.join(RESULT_FOLDER, f'result_{os.path.basename(image_path)}')
145
+ img_with_boxes.save(result_image_path)
 
 
 
146
 
147
+ # Store the text and image result paths
148
  all_extracted_texts[image_path] = result
149
  all_extracted_imgs[image_path] = result_image_path
150
  except ValueError as ve:
 
317
  # Email regex
318
  email_regex = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
319
 
320
+ # Link regex: only matches hosts that start with "www." and end in one of the listed TLDs, so a plain email domain (e.g. gmail.com) is not picked up as a link
 
 
321
  link_regex = re.compile(r'\b(?:https?:\/\/)?(?:www\.)[a-zA-Z0-9-]+\.(?:com|co\.in|co|io|org|net|edu|gov|mil|int|uk|us|in|de|au|app|tech|xyz|info|biz|fr|dev)\b')
322
+
323
  # Find all matches in the text
324
  phone_numbers = [num for num in combined_phone_regex.findall(text) if len(num) >= 5]
325
+
326
  emails = email_regex.findall(text)
327
+
328
  links_RE = [link for link in link_regex.findall(text) if len(link)>=11]
329
 
330
  # Remove profile links that might conflict with emails
 
384
  processed_data['email'].extend(cont_data.get("emails", []))
385
  processed_data['contact_number'].extend(cont_data.get("phone_numbers", []))
386
  processed_data['Link'].extend(cont_data.get("links_RE", []))
387
+ return processed_data