WebashalarForML committed on
Commit
3bc2e7c
1 Parent(s): 11d890f

Update utility/utils.py

Browse files
Files changed (1) hide show
  1. utility/utils.py +46 -17
utility/utils.py CHANGED
@@ -24,6 +24,8 @@ logging.basicConfig(
24
  os.environ['PADDLEOCR_HOME'] = '/tmp/.paddleocr'
25
 
26
  RESULT_FOLDER = 'static/results/'
 
 
27
  if not os.path.exists('/tmp/.paddleocr'):
28
  os.makedirs(RESULT_FOLDER, exist_ok=True)
29
 
@@ -45,11 +47,13 @@ client = InferenceClient(model="mistralai/Mistral-7B-Instruct-v0.3", token=HFT)
45
 
46
  # Load image using OpenCV
47
  def load_image(image_path):
48
- image = cv2.imread(image_path)
49
- if image is None:
50
- raise ValueError(f"Could not load image from {image_path}. It may be corrupted or the path is incorrect.")
51
- return image
52
-
 
 
53
  # Function for upscaling image using OpenCV's INTER_CUBIC
54
  def upscale_image(image, scale=2):
55
  height, width = image.shape[:2]
@@ -171,7 +175,7 @@ def extract_text_from_images(image_paths):
171
  # Function to call the Gemma model and process the output as Json
172
  def Data_Extractor(data, client=client):
173
  text = f'''Act as a Text extractor for the following text given in text: {data}
174
- extract text in the following output JSON string:
175
  {{
176
  "Name": ["Identify and Extract All the person's name from the text."],
177
  "Designation": ["Extract All the designation or job title mentioned in the text."],
@@ -180,17 +184,19 @@ def Data_Extractor(data, client=client):
180
  "Address": ["Extract All the full postal address or location mentioned in the text."],
181
  "Email": ["Identify and Extract All valid email addresses mentioned in the text else 'Not found'."],
182
  "Link": ["Identify and Extract any website URLs or social media links present in the text."]
183
- }}
184
- Output:
 
185
  '''
186
  # Call the API for inference
187
- response = client.text_generation(text, max_new_tokens=600)#, temperature=0.4, top_k=50, top_p=0.9, repetition_penalty=1.2)
188
 
189
  print("parse in text ---:",response)
190
 
191
  # Convert the response text to JSON
192
  try:
193
  json_data = json.loads(response)
 
194
  return json_data
195
  except json.JSONDecodeError as e:
196
  return {"error": f"Error decoding JSON: {e}"}
@@ -228,8 +234,22 @@ def extract_contact_details(text):
228
  \+91\s\d{5}-\d{5} | # India Intl +91 XXXXX-XXXXX
229
  \+91\s\d{4}-\d{6} | # India Intl +91 XXXX-XXXXXX
230
  \+91\s\d{10} | # India Intl +91 XXXXXXXXXX
 
 
 
 
 
 
 
231
  0\d{2}-\d{7} | # India STD 0XX-XXXXXXX
232
  \+91\d{10} | # +91 XXXXXXXXXX
 
 
 
 
 
 
 
233
  \+49\s\d{4}\s\d{8} | # Germany Intl +49 XXXX XXXXXXXX
234
  \+49\s\d{3}\s\d{7} | # Germany Intl +49 XXX XXXXXXX
235
  0\d{3}\s\d{8} | # Germany STD 0XXX XXXXXXXX
@@ -385,16 +405,25 @@ def process_resume_data(LLMdata,cont_data,extracted_text):
385
 
386
  # Initialize the processed data dictionary
387
  processed_data = {
388
- "name": [LLMdata.get('Name', 'Not found')],
389
- "contact_number": [LLMdata.get('Contact', 'Not found')],
390
- "Designation":[LLMdata.get('Designation', 'Not found')],
391
- "email": [LLMdata.get("Email", 'Not found')],
392
- "Location": [LLMdata.get('Address', 'Not found')],
393
- "Link": [LLMdata.get('Link', 'Not found')],
394
- "Company":[LLMdata.get('Company', 'Not found')],
395
  "extracted_text": extracted_text
396
  }
 
 
 
 
 
 
 
 
 
397
  processed_data['email'].extend(cont_data.get("emails", []))
398
  processed_data['contact_number'].extend(cont_data.get("phone_numbers", []))
399
  processed_data['Link'].extend(cont_data.get("links_RE", []))
400
- return processed_data
 
24
  os.environ['PADDLEOCR_HOME'] = '/tmp/.paddleocr'
25
 
26
  RESULT_FOLDER = 'static/results/'
27
+ JSON_FOLDER = 'static/json/'
28
+
29
  if not os.path.exists('/tmp/.paddleocr'):
30
  os.makedirs(RESULT_FOLDER, exist_ok=True)
31
 
 
47
 
48
  # Load image using OpenCV
49
  def load_image(image_path):
50
+ ext = os.path.splitext(image_path)[1].lower()
51
+ if ext in ['.png', '.jpg', '.jpeg', '.webp', '.tiff']:
52
+ image = cv2.imread(image_path)
53
+ return image
54
+ else:
55
+ raise ValueError(f"Could not load image from {image_path}. It may be corrupted or the path is incorrect. or in a not supported format")
56
+
57
  # Function for upscaling image using OpenCV's INTER_CUBIC
58
  def upscale_image(image, scale=2):
59
  height, width = image.shape[:2]
 
175
  # Function to call the Gemma model and process the output as Json
176
  def Data_Extractor(data, client=client):
177
  text = f'''Act as a Text extractor for the following text given in text: {data}
178
+ Extract text in the following output JSON string:
179
  {{
180
  "Name": ["Identify and Extract All the person's name from the text."],
181
  "Designation": ["Extract All the designation or job title mentioned in the text."],
 
184
  "Address": ["Extract All the full postal address or location mentioned in the text."],
185
  "Email": ["Identify and Extract All valid email addresses mentioned in the text else 'Not found'."],
186
  "Link": ["Identify and Extract any website URLs or social media links present in the text."]
187
+ }}
188
+
189
+ Output:
190
  '''
191
  # Call the API for inference
192
+ response = client.text_generation(text, max_new_tokens=1000, temperature=0.4, top_k=50, top_p=0.9, repetition_penalty=1.2)
193
 
194
  print("parse in text ---:",response)
195
 
196
  # Convert the response text to JSON
197
  try:
198
  json_data = json.loads(response)
199
+ print("Json_data-------------->",json_data)
200
  return json_data
201
  except json.JSONDecodeError as e:
202
  return {"error": f"Error decoding JSON: {e}"}
 
234
  \+91\s\d{5}-\d{5} | # India Intl +91 XXXXX-XXXXX
235
  \+91\s\d{4}-\d{6} | # India Intl +91 XXXX-XXXXXX
236
  \+91\s\d{10} | # India Intl +91 XXXXXXXXXX
237
+ \+91\s\d{3}\s\d{3}\s\d{4} | # India Intl +91 XXX XXX XXXX
238
+ \+91\s\d{3}-\d{3}-\d{4} | # India Intl +91 XXX-XXX-XXXX
239
+ \+91\s\d{2}\s\d{4}\s\d{4} | # India Intl +91 XX XXXX XXXX
240
+ \+91\s\d{2}-\d{4}-\d{4} | # India Intl +91 XX-XXXX-XXXX
241
+ \+91\s\d{5}\s\d{5} | # India Intl +91 XXXXX XXXXX
242
+ \d{5}\s\d{5} | # India XXXXX XXXXX
243
+ \d{5}-\d{5} | # India XXXXX-XXXXX
244
  0\d{2}-\d{7} | # India STD 0XX-XXXXXXX
245
  \+91\d{10} | # +91 XXXXXXXXXX
246
+ \d{10} | # XXXXXXXXXX # Regex alternatives to handle all possible contact-number formats
247
+ \d{6}-\d{4} | # XXXXXX-XXXX
248
+ \d{4}-\d{6} | # XXXX-XXXXXX
249
+ \d{3}\s\d{3}\s\d{4} | # XXX XXX XXXX
250
+ \d{3}-\d{3}-\d{4} | # XXX-XXX-XXXX
251
+ \d{4}\s\d{3}\s\d{3} | # XXXX XXX XXX
252
+ \d{4}-\d{3}-\d{3} | # XXXX-XXX-XXX #-----
253
  \+49\s\d{4}\s\d{8} | # Germany Intl +49 XXXX XXXXXXXX
254
  \+49\s\d{3}\s\d{7} | # Germany Intl +49 XXX XXXXXXX
255
  0\d{3}\s\d{8} | # Germany STD 0XXX XXXXXXXX
 
405
 
406
  # Initialize the processed data dictionary
407
  processed_data = {
408
+ "name": [],
409
+ "contact_number": [],
410
+ "Designation":[],
411
+ "email": [],
412
+ "Location": [],
413
+ "Link": [],
414
+ "Company":[],
415
  "extracted_text": extracted_text
416
  }
417
+ #LLM
418
+ processed_data['name'].extend(LLMdata.get('Name', []))
419
+ processed_data['contact_number'].extend(LLMdata.get('Contact', []))
420
+ processed_data['Designation'].extend(LLMdata.get('Designation', []))
421
+ processed_data['email'].extend(LLMdata.get("Email", []))
422
+ processed_data['Location'].extend(LLMdata.get('Address', []))
423
+ processed_data['Link'].extend(LLMdata.get('Link', []))
424
+ processed_data['Company'].extend(LLMdata.get('Company', []))
425
+ #Contact
426
  processed_data['email'].extend(cont_data.get("emails", []))
427
  processed_data['contact_number'].extend(cont_data.get("phone_numbers", []))
428
  processed_data['Link'].extend(cont_data.get("links_RE", []))
429
+ return processed_data