Spaces:
Sleeping
Sleeping
WebashalarForML
committed on
Update utility/utils.py
Browse files- utility/utils.py +30 -31
utility/utils.py
CHANGED
@@ -43,19 +43,6 @@ HFT = os.getenv('HF_TOKEN')
|
|
43 |
# Initialize the InferenceClient
|
44 |
client = InferenceClient(model="mistralai/Mistral-7B-Instruct-v0.3", token=HFT)
|
45 |
|
46 |
-
# Specify a custom model storage directory (ensure this path is writable)
|
47 |
-
#model_storage_directory = '/app/models'
|
48 |
-
|
49 |
-
# Create the reader object and set the model storage directory
|
50 |
-
#reader = easyocr.Reader(['en'], model_storage_directory=model_storage_directory)
|
51 |
-
|
52 |
-
def draw_boxes(image, bounds, color='red', width=2):
|
53 |
-
draw = ImageDraw.Draw(image)
|
54 |
-
for bound in bounds:
|
55 |
-
p0, p1, p2, p3 = bound[0]
|
56 |
-
draw.line([*p0, *p1, *p2, *p3, *p0], fill=color, width=width)
|
57 |
-
return image
|
58 |
-
|
59 |
# Load image using OpenCV
|
60 |
def load_image(image_path):
|
61 |
image = cv2.imread(image_path)
|
@@ -108,11 +95,10 @@ def process_image(image_path, scale=2):
|
|
108 |
|
109 |
return final_image
|
110 |
|
|
|
111 |
def ocr_with_paddle(img):
|
112 |
-
|
113 |
-
|
114 |
-
#ocr = PaddleOCR(lang='en', use_angle_cls=True, det_model_dir=model_dir)
|
115 |
-
#ocr = PaddleOCR(lang='en', use_angle_cls=True, det_model_dir=os.environ['PADDLEOCR_HOME'])
|
116 |
logging.info(f"PADDLEOCR_HOME: {os.environ['PADDLEOCR_HOME']}")
|
117 |
ocr = PaddleOCR(
|
118 |
lang='en',
|
@@ -123,12 +109,22 @@ def ocr_with_paddle(img):
|
|
123 |
)
|
124 |
|
125 |
result = ocr.ocr(img)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
126 |
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
|
|
|
|
131 |
|
|
|
132 |
def extract_text_from_images(image_paths):
|
133 |
all_extracted_texts = {}
|
134 |
all_extracted_imgs = {}
|
@@ -137,15 +133,18 @@ def extract_text_from_images(image_paths):
|
|
137 |
# Enhance the image before OCR
|
138 |
enhanced_image = process_image(image_path, scale=2)
|
139 |
|
140 |
-
#
|
|
|
|
|
|
|
141 |
img_result = Image.fromarray(enhanced_image)
|
|
|
142 |
|
|
|
143 |
result_image_path = os.path.join(RESULT_FOLDER, f'result_{os.path.basename(image_path)}')
|
144 |
-
|
145 |
-
|
146 |
-
# Perform OCR on the enhanced image
|
147 |
-
result = ocr_with_paddle(enhanced_image)
|
148 |
|
|
|
149 |
all_extracted_texts[image_path] = result
|
150 |
all_extracted_imgs[image_path] = result_image_path
|
151 |
except ValueError as ve:
|
@@ -318,14 +317,14 @@ def extract_contact_details(text):
|
|
318 |
# Email regex
|
319 |
email_regex = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
|
320 |
|
321 |
-
#
|
322 |
-
#link_regex = re.compile(r'\b(?:https?://)?(?:www\.)?(?:linkedin\.com|github\.com|indeed\.com|[A-Za-z0-9-]+\.[A-Za-z]{2,})[\w./?-]*\b')
|
323 |
-
#link_regex = re.compile(r'\b(?:https?://)?(?:www\.)?[a-zA-Z0-9-]+\.(?:[a-zA-Z]{2,})(?:\.[a-zA-Z]{2,})?(?:\.[a-zA-Z]{2,})?(?:[/\w.-]*)*[\w/?&=-]*\b')
|
324 |
link_regex = re.compile(r'\b(?:https?:\/\/)?(?:www\.)[a-zA-Z0-9-]+\.(?:com|co\.in|co|io|org|net|edu|gov|mil|int|uk|us|in|de|au|app|tech|xyz|info|biz|fr|dev)\b')
|
|
|
325 |
# Find all matches in the text
|
326 |
phone_numbers = [num for num in combined_phone_regex.findall(text) if len(num) >= 5]
|
327 |
-
|
328 |
emails = email_regex.findall(text)
|
|
|
329 |
links_RE = [link for link in link_regex.findall(text) if len(link)>=11]
|
330 |
|
331 |
# Remove profile links that might conflict with emails
|
@@ -385,4 +384,4 @@ def process_resume_data(LLMdata,cont_data,extracted_text):
|
|
385 |
processed_data['email'].extend(cont_data.get("emails", []))
|
386 |
processed_data['contact_number'].extend(cont_data.get("phone_numbers", []))
|
387 |
processed_data['Link'].extend(cont_data.get("links_RE", []))
|
388 |
-
return processed_data
|
|
|
43 |
# Initialize the InferenceClient
|
44 |
client = InferenceClient(model="mistralai/Mistral-7B-Instruct-v0.3", token=HFT)
|
45 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
46 |
# Load image using OpenCV
|
47 |
def load_image(image_path):
|
48 |
image = cv2.imread(image_path)
|
|
|
95 |
|
96 |
return final_image
|
97 |
|
98 |
+
# Function for OCR with PaddleOCR, returning both text and bounding boxes
|
99 |
def ocr_with_paddle(img):
|
100 |
+
final_text = ''
|
101 |
+
|
|
|
|
|
102 |
logging.info(f"PADDLEOCR_HOME: {os.environ['PADDLEOCR_HOME']}")
|
103 |
ocr = PaddleOCR(
|
104 |
lang='en',
|
|
|
109 |
)
|
110 |
|
111 |
result = ocr.ocr(img)
|
112 |
+
boxes = []
|
113 |
+
for line in result[0]:
|
114 |
+
box, text, _ = line
|
115 |
+
boxes.append(box) # Append the bounding box
|
116 |
+
final_text += ' ' + text
|
117 |
+
|
118 |
+
return final_text, boxes
|
119 |
|
120 |
+
# Function to draw bounding boxes around text
|
121 |
+
def draw_boxes(image, boxes):
|
122 |
+
draw = ImageDraw.Draw(image)
|
123 |
+
for box in boxes:
|
124 |
+
draw.polygon(box, outline="red", width=3)
|
125 |
+
return image
|
126 |
|
127 |
+
# Extract text and create a result image with bounding boxes
|
128 |
def extract_text_from_images(image_paths):
|
129 |
all_extracted_texts = {}
|
130 |
all_extracted_imgs = {}
|
|
|
133 |
# Enhance the image before OCR
|
134 |
enhanced_image = process_image(image_path, scale=2)
|
135 |
|
136 |
+
# Perform OCR on the enhanced image and get boxes
|
137 |
+
result, boxes = ocr_with_paddle(enhanced_image)
|
138 |
+
|
139 |
+
# Draw bounding boxes on the processed image
|
140 |
img_result = Image.fromarray(enhanced_image)
|
141 |
+
img_with_boxes = draw_boxes(img_result, boxes)
|
142 |
|
143 |
+
# Save the image with boxes
|
144 |
result_image_path = os.path.join(RESULT_FOLDER, f'result_{os.path.basename(image_path)}')
|
145 |
+
img_with_boxes.save(result_image_path)
|
|
|
|
|
|
|
146 |
|
147 |
+
# Store the text and image result paths
|
148 |
all_extracted_texts[image_path] = result
|
149 |
all_extracted_imgs[image_path] = result_image_path
|
150 |
except ValueError as ve:
|
|
|
317 |
# Email regex
|
318 |
email_regex = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
|
319 |
|
320 |
+
# URL and links regex, updated to avoid conflicts with email domains
|
|
|
|
|
321 |
link_regex = re.compile(r'\b(?:https?:\/\/)?(?:www\.)[a-zA-Z0-9-]+\.(?:com|co\.in|co|io|org|net|edu|gov|mil|int|uk|us|in|de|au|app|tech|xyz|info|biz|fr|dev)\b')
|
322 |
+
|
323 |
# Find all matches in the text
|
324 |
phone_numbers = [num for num in combined_phone_regex.findall(text) if len(num) >= 5]
|
325 |
+
|
326 |
emails = email_regex.findall(text)
|
327 |
+
|
328 |
links_RE = [link for link in link_regex.findall(text) if len(link)>=11]
|
329 |
|
330 |
# Remove profile links that might conflict with emails
|
|
|
384 |
processed_data['email'].extend(cont_data.get("emails", []))
|
385 |
processed_data['contact_number'].extend(cont_data.get("phone_numbers", []))
|
386 |
processed_data['Link'].extend(cont_data.get("links_RE", []))
|
387 |
+
return processed_data
|