WebashalarForML committed
Commit d8b7b87
1 Parent(s): 627c9e7

Create utility/utils.py

Files changed (1)
  1. utility/utils.py +352 -0
utility/utils.py ADDED
@@ -0,0 +1,352 @@
+ # libraries
+ import os
+ from huggingface_hub import InferenceClient
+ from dotenv import load_dotenv
+ import json
+ import re
+ import easyocr
+ import spacy
+ from PIL import Image, ImageEnhance, ImageDraw
+ import cv2
+ import numpy as np
+ from paddleocr import PaddleOCR
+
+ # Load environment variables from .env file
+ load_dotenv()
+
+ # Authenticate with Hugging Face
+ HFT = os.getenv('HF_TOKEN')
+
+ # Initialize the InferenceClient
+ client = InferenceClient(model="mistralai/Mistral-7B-Instruct-v0.3", token=HFT)
+
+ # Initialize EasyOCR reader for extracting text
+ reader = easyocr.Reader(['en'])
+
+ # Initialize spaCy's English model
+ nlp = spacy.load("en_core_web_sm")
+
+ def draw_boxes(image, bounds, color='red', width=2):
+     draw = ImageDraw.Draw(image)
+     for bound in bounds:
+         p0, p1, p2, p3 = bound[0]
+         draw.line([*p0, *p1, *p2, *p3, *p0], fill=color, width=width)
+     return image
+
+ # Image quality upscaling
+ # Load image using OpenCV
+ def load_image(image_path):
+     return cv2.imread(image_path)
+
+ # Function for upscaling the image using OpenCV's INTER_CUBIC (or ESRGAN, if available)
+ def upscale_image(image, scale=2):
+     height, width = image.shape[:2]
+     # Simple upscaling using cubic interpolation
+     upscaled_image = cv2.resize(image, (width * scale, height * scale), interpolation=cv2.INTER_CUBIC)
+     return upscaled_image
+
+ # Function to denoise the image (reduce noise)
+ def reduce_noise(image):
+     return cv2.fastNlMeansDenoisingColored(image, None, 10, 10, 7, 21)
+
+ # Function to sharpen the image
+ def sharpen_image(image):
+     kernel = np.array([[0, -1, 0],
+                        [-1, 5, -1],
+                        [0, -1, 0]])
+     sharpened_image = cv2.filter2D(image, -1, kernel)
+     return sharpened_image
+
+ # Function to increase contrast and enhance details without changing color
+ def enhance_image(image):
+     # Convert from BGR to RGB for PIL processing, then back to BGR
+     pil_img = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
+     enhancer = ImageEnhance.Contrast(pil_img)
+     enhanced_image = enhancer.enhance(1.5)
+     # Convert back to BGR
+     enhanced_image_bgr = cv2.cvtColor(np.array(enhanced_image), cv2.COLOR_RGB2BGR)
+     return enhanced_image_bgr
+
+ # Complete function to process an image: upscale, denoise, sharpen, and enhance contrast
+ def process_image(image_path, scale=2):
+     # Load the image
+     image = load_image(image_path)
+
+     # Upscale the image
+     upscaled_image = upscale_image(image, scale)
+
+     # Reduce noise
+     denoised_image = reduce_noise(upscaled_image)
+
+     # Sharpen the image
+     sharpened_image = sharpen_image(denoised_image)
+
+     # Enhance the image contrast and details without changing color
+     final_image = enhance_image(sharpened_image)
+
+     return final_image
+
+
+ # Run PaddleOCR on an image array and return the concatenated text
+ def ocr_with_paddle(img):
+     finaltext = ''
+     ocr = PaddleOCR(lang='en', use_angle_cls=True)
+     result = ocr.ocr(img)
+
+     for i in range(len(result[0])):
+         text = result[0][i][1][0]
+         finaltext += ' ' + text
+     return finaltext
+
+
+ # Folder for saving the annotated OCR result images (assumed default; adjust to the app's layout)
+ RESULT_FOLDER = os.getenv('RESULT_FOLDER', 'results')
+ os.makedirs(RESULT_FOLDER, exist_ok=True)
+
+ def extract_text_from_images(image_paths):
+     all_extracted_texts = {}
+     all_extracted_imgs = {}
+     for image_path in image_paths:
+         # Enhance the image before OCR
+         enhanced_image = process_image(image_path, scale=2)
+         bounds = reader.readtext(enhanced_image)
+         # Draw boxes on the processed image (convert BGR to RGB so the saved colors are correct)
+         img_result = Image.fromarray(cv2.cvtColor(enhanced_image, cv2.COLOR_BGR2RGB))
+         draw_boxes(img_result, bounds)
+
+         result_image_path = os.path.join(RESULT_FOLDER, f'result_{os.path.basename(image_path)}')
+         img_result.save(result_image_path)  # Save the processed image
+
+         # Perform OCR on the enhanced image
+         result = ocr_with_paddle(enhanced_image)
+
+         all_extracted_texts[image_path] = result
+         all_extracted_imgs[image_path] = result_image_path
+     # Convert to a JSON-compatible structure
+     all_extracted_imgs_json = {str(k): str(v) for k, v in all_extracted_imgs.items()}
+     return all_extracted_texts, all_extracted_imgs_json
+
+ # Function to call the LLM (Mistral-7B-Instruct) and parse its output as JSON
+ def Data_Extractor(data, client):
+     text = f'''Act as a text extractor for the following text: {data}
+     Extract the text into the following output JSON string:
+     {{
+     "Name": ["Identify and extract all person names from the text."],
+     "Designation": ["Extract all designations or job titles mentioned in the text."],
+     "Company": ["Extract all company or organization names if mentioned."],
+     "Contact": ["Extract all phone numbers, including country codes if present."],
+     "Address": ["Extract the full postal address or location mentioned in the text."],
+     "Email": ["Identify and extract all valid email addresses mentioned in the text, else 'Not found'."],
+     "Link": ["Identify and extract any website URLs or social media links present in the text."]
+     }}
+     Output:
+     '''
+     # Call the API for inference
+     response = client.text_generation(text, max_new_tokens=600)
+
+     print("parsed text ---:", response)
+
+     # Convert the response text to JSON
+     try:
+         json_data = json.loads(response)
+         return json_data
+     except json.JSONDecodeError as e:
+         return {"error": f"Error decoding JSON: {e}"}
+
+ # Flatten the extracted-text JSON into a single string for the LLM
+ def json_to_llm_str(textJson):
+     llm_str = ''
+     for file, item in textJson.items():
+         llm_str += item + ' '
+     return llm_str
+
+ # Define the regular expressions for extracting contact details (phone numbers, emails, portfolio/website links, etc.)
+ def extract_contact_details(text):
+     # Regex patterns
+     # Phone numbers with at least 5 digits in any segment
+     combined_phone_regex = re.compile(r'''
+     (?:
+         #(?:(?:\+91[-.\s]?)?\d{5}[-.\s]?\d{5})|(?:\+?\d{1,3})?[-.\s()]?\d{5,}[-.\s()]?\d{5,}[-.\s()]?\d{1,9} | /^[\.-)( ]*([0-9]{3})[\.-)( ]*([0-9]{3})[\.-)( ]*([0-9]{4})$/ |
+         \+1\s\(\d{3}\)\s\d{3}-\d{4} |              # USA/Canada Intl +1 (XXX) XXX-XXXX
+         \(\d{3}\)\s\d{3}-\d{4} |                   # USA/Canada STD (XXX) XXX-XXXX
+         \(\d{3}\)\s\d{3}\s\d{4} |                  # USA/Canada (XXX) XXX XXXX
+         \(\d{3}\)\s\d{3}\s\d{3} |                  # USA/Canada (XXX) XXX XXX
+         \+1\d{10} |                                # +1 XXXXXXXXXX
+         \d{10} |                                   # XXXXXXXXXX
+         \+44\s\d{4}\s\d{6} |                       # UK Intl +44 XXXX XXXXXX
+         \+44\s\d{3}\s\d{3}\s\d{4} |                # UK Intl +44 XXX XXX XXXX
+         0\d{4}\s\d{6} |                            # UK STD 0XXXX XXXXXX
+         0\d{3}\s\d{3}\s\d{4} |                     # UK STD 0XXX XXX XXXX
+         \+44\d{10} |                               # +44 XXXXXXXXXX
+         0\d{10} |                                  # 0XXXXXXXXXX
+         \+61\s\d\s\d{4}\s\d{4} |                   # Australia Intl +61 X XXXX XXXX
+         0\d\s\d{4}\s\d{4} |                        # Australia STD 0X XXXX XXXX
+         \+61\d{9} |                                # +61 XXXXXXXXX
+         0\d{9} |                                   # 0XXXXXXXXX
+         \+91\s\d{5}-\d{5} |                        # India Intl +91 XXXXX-XXXXX
+         \+91\s\d{4}-\d{6} |                        # India Intl +91 XXXX-XXXXXX
+         \+91\s\d{10} |                             # India Intl +91 XXXXXXXXXX
+         0\d{2}-\d{7} |                             # India STD 0XX-XXXXXXX
+         \+91\d{10} |                               # +91 XXXXXXXXXX
+         \+49\s\d{4}\s\d{8} |                       # Germany Intl +49 XXXX XXXXXXXX
+         \+49\s\d{3}\s\d{7} |                       # Germany Intl +49 XXX XXXXXXX
+         0\d{3}\s\d{8} |                            # Germany STD 0XXX XXXXXXXX
+         \+49\d{12} |                               # +49 XXXXXXXXXXXX
+         \+49\d{10} |                               # +49 XXXXXXXXXX
+         0\d{11} |                                  # 0XXXXXXXXXXX
+         \+86\s\d{3}\s\d{4}\s\d{4} |                # China Intl +86 XXX XXXX XXXX
+         0\d{3}\s\d{4}\s\d{4} |                     # China STD 0XXX XXXX XXXX
+         \+86\d{11} |                               # +86 XXXXXXXXXXX
+         \+81\s\d\s\d{4}\s\d{4} |                   # Japan Intl +81 X XXXX XXXX
+         \+81\s\d{2}\s\d{4}\s\d{4} |                # Japan Intl +81 XX XXXX XXXX
+         0\d\s\d{4}\s\d{4} |                        # Japan STD 0X XXXX XXXX
+         \+81\d{10} |                               # +81 XXXXXXXXXX
+         \+81\d{9} |                                # +81 XXXXXXXXX
+         0\d{9} |                                   # 0XXXXXXXXX
+         \+55\s\d{2}\s\d{5}-\d{4} |                 # Brazil Intl +55 XX XXXXX-XXXX
+         \+55\s\d{2}\s\d{4}-\d{4} |                 # Brazil Intl +55 XX XXXX-XXXX
+         0\d{2}\s\d{4}\s\d{4} |                     # Brazil STD 0XX XXXX XXXX
+         \+55\d{11} |                               # +55 XXXXXXXXXXX
+         \+55\d{10} |                               # +55 XXXXXXXXXX
+         0\d{10} |                                  # 0XXXXXXXXXX
+         \+33\s\d\s\d{2}\s\d{2}\s\d{2}\s\d{2} |     # France Intl +33 X XX XX XX XX
+         0\d\s\d{2}\s\d{2}\s\d{2}\s\d{2} |          # France STD 0X XX XX XX XX
+         \+33\d{9} |                                # +33 XXXXXXXXX
+         0\d{9} |                                   # 0XXXXXXXXX
+         \+7\s\d{3}\s\d{3}-\d{2}-\d{2} |            # Russia Intl +7 XXX XXX-XX-XX
+         8\s\d{3}\s\d{3}-\d{2}-\d{2} |              # Russia STD 8 XXX XXX-XX-XX
+         \+7\d{10} |                                # +7 XXXXXXXXXX
+         8\d{10} |                                  # 8 XXXXXXXXXX
+         \+27\s\d{2}\s\d{3}\s\d{4} |                # South Africa Intl +27 XX XXX XXXX
+         0\d{2}\s\d{3}\s\d{4} |                     # South Africa STD 0XX XXX XXXX
+         \+27\d{9} |                                # +27 XXXXXXXXX
+         0\d{9} |                                   # 0XXXXXXXXX
+         \+52\s\d{3}\s\d{3}\s\d{4} |                # Mexico Intl +52 XXX XXX XXXX
+         \+52\s\d{2}\s\d{4}\s\d{4} |                # Mexico Intl +52 XX XXXX XXXX
+         01\s\d{3}\s\d{4} |                         # Mexico STD 01 XXX XXXX
+         \+52\d{10} |                               # +52 XXXXXXXXXX
+         01\d{7} |                                  # 01 XXXXXXX
+         \+234\s\d{3}\s\d{3}\s\d{4} |               # Nigeria Intl +234 XXX XXX XXXX
+         0\d{3}\s\d{3}\s\d{4} |                     # Nigeria STD 0XXX XXX XXXX
+         \+234\d{10} |                              # +234 XXXXXXXXXX
+         0\d{10} |                                  # 0XXXXXXXXXX
+         \+971\s\d\s\d{3}\s\d{4} |                  # UAE Intl +971 X XXX XXXX
+         0\d\s\d{3}\s\d{4} |                        # UAE STD 0X XXX XXXX
+         \+971\d{8} |                               # +971 XXXXXXXX
+         0\d{8} |                                   # 0XXXXXXXX
+         \+54\s9\s\d{3}\s\d{3}\s\d{4} |             # Argentina Intl +54 9 XXX XXX XXXX
+         \+54\s\d{1}\s\d{4}\s\d{4} |                # Argentina Intl +54 X XXXX XXXX
+         0\d{3}\s\d{4} |                            # Argentina STD 0XXX XXXX
+         \+54\d{10} |                               # +54 9 XXXXXXXXXX
+         \+54\d{9} |                                # +54 XXXXXXXXX
+         0\d{7} |                                   # 0XXXXXXX
+         \+966\s\d\s\d{3}\s\d{4} |                  # Saudi Intl +966 X XXX XXXX
+         0\d\s\d{3}\s\d{4} |                        # Saudi STD 0X XXX XXXX
+         \+966\d{8} |                               # +966 XXXXXXXX
+         0\d{8} |                                   # 0XXXXXXXX
+         \+1\d{10} |                                # +1 XXXXXXXXXX
+         \+1\s\d{3}\s\d{3}\s\d{4} |                 # +1 XXX XXX XXXX
+         \d{5}\s\d{5} |                             # XXXXX XXXXX
+         \d{10} |                                   # XXXXXXXXXX
+         \+44\d{10} |                               # +44 XXXXXXXXXX
+         0\d{10} |                                  # 0XXXXXXXXXX
+         \+61\d{9} |                                # +61 XXXXXXXXX
+         0\d{9} |                                   # 0XXXXXXXXX
+         \+91\d{10} |                               # +91 XXXXXXXXXX
+         \+49\d{12} |                               # +49 XXXXXXXXXXXX
+         \+49\d{10} |                               # +49 XXXXXXXXXX
+         0\d{11} |                                  # 0XXXXXXXXXXX
+         \+86\d{11} |                               # +86 XXXXXXXXXXX
+         \+81\d{10} |                               # +81 XXXXXXXXXX
+         \+81\d{9} |                                # +81 XXXXXXXXX
+         0\d{9} |                                   # 0XXXXXXXXX
+         \+55\d{11} |                               # +55 XXXXXXXXXXX
+         \+55\d{10} |                               # +55 XXXXXXXXXX
+         0\d{10} |                                  # 0XXXXXXXXXX
+         \+33\d{9} |                                # +33 XXXXXXXXX
+         0\d{9} |                                   # 0XXXXXXXXX
+         \+7\d{10} |                                # +7 XXXXXXXXXX
+         8\d{10} |                                  # 8 XXXXXXXXXX
+         \+27\d{9} |                                # +27 XXXXXXXXX
+         0\d{9} |                                   # 0XXXXXXXXX (South Africa STD)
+         \+52\d{10} |                               # +52 XXXXXXXXXX
+         01\d{7} |                                  # 01 XXXXXXX
+         \+234\d{10} |                              # +234 XXXXXXXXXX
+         0\d{10} |                                  # 0XXXXXXXXXX
+         \+971\d{8} |                               # +971 XXXXXXXX
+         0\d{8} |                                   # 0XXXXXXXX
+         \+54\s9\s\d{10} |                          # +54 9 XXXXXXXXXX
+         \+54\d{9} |                                # +54 XXXXXXXXX
+         0\d{7} |                                   # 0XXXXXXX
+         \+966\d{8} |                               # +966 XXXXXXXX
+         0\d{8} |                                   # 0XXXXXXXX
+         \+\d{3}-\d{3}-\d{4}                        # +XXX-XXX-XXXX
+     )
+     ''', re.VERBOSE)
+
+     # Email regex
+     email_regex = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
+
+     # Links regex, updated to avoid conflicts with email domains
+     link_regex = re.compile(r'\b(?:https?:\/\/)?(?:www\.)[a-zA-Z0-9-]+\.(?:com|co\.in|co|io|org|net|edu|gov|mil|int|uk|us|in|de|au|app|tech|xyz|info|biz|fr|dev)\b')
+
+     # Find all matches in the text
+     phone_numbers = [num for num in combined_phone_regex.findall(text) if len(num) >= 5]
+     print("phone_numbers --->", phone_numbers)
+     emails = email_regex.findall(text)
+     links_RE = [link for link in link_regex.findall(text) if len(link) >= 11]
+
+     # Remove profile links that might conflict with emails
+     links_RE = [link for link in links_RE if not any(email in link for email in emails)]
+
+     return {
+         "phone_numbers": phone_numbers,
+         "emails": emails,
+         "links_RE": links_RE
+     }
+
+ # Preprocess the extracted text and collect contact details with the regex extractor
+ def process_extracted_text(extracted_text):
+     # Round-trip through JSON to normalize the extracted text into plain strings
+     data = json.dumps(extracted_text, indent=4)
+     data = json.loads(data)
+
+     # Create a single dictionary to hold combined results
+     combined_results = {
+         "phone_numbers": [],
+         "emails": [],
+         "links_RE": []
+     }
+
+     # Process each text entry
+     for filename, text in data.items():
+         contact_details = extract_contact_details(text)
+         # Extend combined results with the details from this file
+         combined_results["phone_numbers"].extend(contact_details["phone_numbers"])
+         combined_results["emails"].extend(contact_details["emails"])
+         combined_results["links_RE"].extend(contact_details["links_RE"])
+
+     # The combined results are already a JSON-compatible dictionary
+     combined_results_json = combined_results
+
+     # Print the final JSON results
+     print("Combined contact details in JSON format:")
+     print(combined_results_json)
+
+     return combined_results_json
+
+ # Merge the LLM output with the regex-extracted contact details into the final parsed result
+ def process_resume_data(LLMdata, cont_data, extracted_text):
+     # Initialize the processed data dictionary
+     processed_data = {
+         "name": [LLMdata.get('Name', 'Not found')],
+         "contact_number": [LLMdata.get('Contact', 'Not found')],
+         "Designation": [LLMdata.get('Designation', 'Not found')],
+         "email": [LLMdata.get("Email", 'Not found')],
+         "Location": [LLMdata.get('Address', 'Not found')],
+         "Link": [LLMdata.get('Link', 'Not found')],
+         "Company": [LLMdata.get('Company', 'Not found')],
+         "extracted_text": extracted_text
+     }
+     processed_data['email'].extend(cont_data.get("emails", []))
+     processed_data['contact_number'].extend(cont_data.get("phone_numbers", []))
+     processed_data['Link'].extend(cont_data.get("links_RE", []))
+     return processed_data
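
For context, the helpers in this file are meant to chain together roughly as sketched below. This sketch is illustrative only and is not part of the commit: the image path is hypothetical, and it assumes the utility directory is importable as a package and that HF_TOKEN is set in the environment (see load_dotenv above).

# Minimal usage sketch (hypothetical input path)
from utility.utils import (client, extract_text_from_images, json_to_llm_str,
                           Data_Extractor, process_extracted_text, process_resume_data)

# 1. Enhance the images, run OCR, and save annotated copies
texts, annotated_imgs = extract_text_from_images(['uploads/sample_card.jpg'])

# 2. Flatten the per-image text and ask the LLM for structured fields
llm_fields = Data_Extractor(json_to_llm_str(texts), client)

# 3. Independently pull phone numbers, emails, and links with the regexes
contact_data = process_extracted_text(texts)

# 4. Merge both sources into the final parsed record
parsed = process_resume_data(llm_fields, contact_data, texts)
print(parsed)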