WebashalarForML committed on
Commit
6fac153
1 Parent(s): 567a60e

Update utility/utils.py

Browse files
Files changed (1) hide show
  1. utility/utils.py +16 -6
utility/utils.py CHANGED
@@ -5,6 +5,7 @@ from dotenv import load_dotenv
5
  import json
6
  import re
7
  import easyocr
 
8
  from PIL import Image, ImageEnhance, ImageDraw
9
  import cv2
10
  import numpy as np
@@ -22,6 +23,9 @@ client = InferenceClient(model="mistralai/Mistral-7B-Instruct-v0.3", token=HFT)
22
  # Initialize EasyOCR reader for extracting text
23
  reader = easyocr.Reader(['en'])
24
 
 
 
 
25
  def draw_boxes(image, bounds, color='red', width=2):
26
  draw = ImageDraw.Draw(image)
27
  for bound in bounds:
@@ -95,11 +99,12 @@ def ocr_with_paddle(img):
95
  return finaltext
96
 
97
 
98
- def extract_text_from_images(image_paths):
99
  all_extracted_texts = {}
100
  all_extracted_imgs={}
101
  for image_path in image_paths:
102
  # Enhance the image before OCR
 
103
  enhanced_image = process_image(image_path, scale=2)
104
  bounds = reader.readtext(enhanced_image)
105
  # Draw boxes on the processed image
@@ -111,15 +116,18 @@ def extract_text_from_images(image_paths):
111
 
112
  # Perform OCR on the enhanced image
113
  result=ocr_with_paddle(enhanced_image)
114
-
 
 
115
  all_extracted_texts[image_path] =result
116
  all_extracted_imgs[image_path] = result_image_path
117
  # Convert to JSON-compatible structure
118
  all_extracted_imgs_json = {str(k): str(v) for k, v in all_extracted_imgs.items()}
119
  return all_extracted_texts,all_extracted_imgs_json
120
 
 
121
  # Function to call the Gemma model and process the output as Json
122
- def Data_Extractor(data, client):
123
  text = f'''Act as a Text extractor for the following text given in text: {data}
124
  extract text in the following output JSON string:
125
  {{
@@ -134,7 +142,7 @@ def Data_Extractor(data, client):
134
  Output:
135
  '''
136
  # Call the API for inference
137
- response = client.text_generation(text, max_new_tokens=600)
138
 
139
  print("parse in text ---:",response)
140
 
@@ -280,9 +288,10 @@ def extract_contact_details(text):
280
  # Email regex
281
  email_regex = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
282
 
283
- # Links regex, updated to avoid conflicts with email domains
 
 
284
  link_regex = re.compile(r'\b(?:https?:\/\/)?(?:www\.)[a-zA-Z0-9-]+\.(?:com|co\.in|co|io|org|net|edu|gov|mil|int|uk|us|in|de|au|app|tech|xyz|info|biz|fr|dev)\b')
285
-
286
  # Find all matches in the text
287
  phone_numbers = [num for num in combined_phone_regex.findall(text) if len(num) >= 5]
288
  print("phone_numbers--->",phone_numbers)
@@ -320,6 +329,7 @@ def process_extracted_text(extracted_text):
320
  combined_results["links_RE"].extend(contact_details["links_RE"])
321
 
322
  # Convert the combined results to JSON
 
323
  combined_results_json = combined_results
324
 
325
  # Print the final JSON results
 
5
  import json
6
  import re
7
  import easyocr
8
+ import spacy
9
  from PIL import Image, ImageEnhance, ImageDraw
10
  import cv2
11
  import numpy as np
 
23
  # Initialize EasyOCR reader for extracting text
24
  reader = easyocr.Reader(['en'])
25
 
26
+ # Initialize spaCy's English model
27
+ nlp = spacy.load("en_core_web_sm")
28
+
29
  def draw_boxes(image, bounds, color='red', width=2):
30
  draw = ImageDraw.Draw(image)
31
  for bound in bounds:
 
99
  return finaltext
100
 
101
 
102
+ def extract_text_from_images(image_paths, RESULT_FOLDER):
103
  all_extracted_texts = {}
104
  all_extracted_imgs={}
105
  for image_path in image_paths:
106
  # Enhance the image before OCR
107
+ #enhanced_image = load_image(image_path)
108
  enhanced_image = process_image(image_path, scale=2)
109
  bounds = reader.readtext(enhanced_image)
110
  # Draw boxes on the processed image
 
116
 
117
  # Perform OCR on the enhanced image
118
  result=ocr_with_paddle(enhanced_image)
119
+ # results = reader.readtext(enhanced_image)
120
+ # extracted_text = " ".join([res[1] for res in results])
121
+
122
  all_extracted_texts[image_path] =result
123
  all_extracted_imgs[image_path] = result_image_path
124
  # Convert to JSON-compatible structure
125
  all_extracted_imgs_json = {str(k): str(v) for k, v in all_extracted_imgs.items()}
126
  return all_extracted_texts,all_extracted_imgs_json
127
 
128
+
129
  # Function to call the Gemma model and process the output as Json
130
+ def Data_Extractor(data, client=client):
131
  text = f'''Act as a Text extractor for the following text given in text: {data}
132
  extract text in the following output JSON string:
133
  {{
 
142
  Output:
143
  '''
144
  # Call the API for inference
145
+ response = client.text_generation(text, max_new_tokens=600)#, temperature=0.4, top_k=50, top_p=0.9, repetition_penalty=1.2)
146
 
147
  print("parse in text ---:",response)
148
 
 
288
  # Email regex
289
  email_regex = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
290
 
291
+ # Profile links regex, updated to avoid conflicts with email domains
292
+ #link_regex = re.compile(r'\b(?:https?://)?(?:www\.)?(?:linkedin\.com|github\.com|indeed\.com|[A-Za-z0-9-]+\.[A-Za-z]{2,})[\w./?-]*\b')
293
+ #link_regex = re.compile(r'\b(?:https?://)?(?:www\.)?[a-zA-Z0-9-]+\.(?:[a-zA-Z]{2,})(?:\.[a-zA-Z]{2,})?(?:\.[a-zA-Z]{2,})?(?:[/\w.-]*)*[\w/?&=-]*\b')
294
  link_regex = re.compile(r'\b(?:https?:\/\/)?(?:www\.)[a-zA-Z0-9-]+\.(?:com|co\.in|co|io|org|net|edu|gov|mil|int|uk|us|in|de|au|app|tech|xyz|info|biz|fr|dev)\b')
 
295
  # Find all matches in the text
296
  phone_numbers = [num for num in combined_phone_regex.findall(text) if len(num) >= 5]
297
  print("phone_numbers--->",phone_numbers)
 
329
  combined_results["links_RE"].extend(contact_details["links_RE"])
330
 
331
  # Convert the combined results to JSON
332
+ #combined_results_json = json.dumps(combined_results, indent=4)
333
  combined_results_json = combined_results
334
 
335
  # Print the final JSON results