Spaces:
Sleeping
Sleeping
WebashalarForML
committed on
Commit
•
3bc2e7c
1
Parent(s):
11d890f
Update utility/utils.py
Browse files- utility/utils.py +46 -17
utility/utils.py
CHANGED
@@ -24,6 +24,8 @@ logging.basicConfig(
|
|
24 |
os.environ['PADDLEOCR_HOME'] = '/tmp/.paddleocr'
|
25 |
|
26 |
RESULT_FOLDER = 'static/results/'
|
|
|
|
|
27 |
if not os.path.exists('/tmp/.paddleocr'):
|
28 |
os.makedirs(RESULT_FOLDER, exist_ok=True)
|
29 |
|
@@ -45,11 +47,13 @@ client = InferenceClient(model="mistralai/Mistral-7B-Instruct-v0.3", token=HFT)
|
|
45 |
|
46 |
# Load image using OpenCV
|
47 |
def load_image(image_path):
|
48 |
-
|
49 |
-
if
|
50 |
-
|
51 |
-
|
52 |
-
|
|
|
|
|
53 |
# Function for upscaling image using OpenCV's INTER_CUBIC
|
54 |
def upscale_image(image, scale=2):
|
55 |
height, width = image.shape[:2]
|
@@ -171,7 +175,7 @@ def extract_text_from_images(image_paths):
|
|
171 |
# Function to call the Gemma model and process the output as Json
|
172 |
def Data_Extractor(data, client=client):
|
173 |
text = f'''Act as a Text extractor for the following text given in text: {data}
|
174 |
-
|
175 |
{{
|
176 |
"Name": ["Identify and Extract All the person's name from the text."],
|
177 |
"Designation": ["Extract All the designation or job title mentioned in the text."],
|
@@ -180,17 +184,19 @@ def Data_Extractor(data, client=client):
|
|
180 |
"Address": ["Extract All the full postal address or location mentioned in the text."],
|
181 |
"Email": ["Identify and Extract All valid email addresses mentioned in the text else 'Not found'."],
|
182 |
"Link": ["Identify and Extract any website URLs or social media links present in the text."]
|
183 |
-
}}
|
184 |
-
|
|
|
185 |
'''
|
186 |
# Call the API for inference
|
187 |
-
response = client.text_generation(text, max_new_tokens=
|
188 |
|
189 |
print("parse in text ---:",response)
|
190 |
|
191 |
# Convert the response text to JSON
|
192 |
try:
|
193 |
json_data = json.loads(response)
|
|
|
194 |
return json_data
|
195 |
except json.JSONDecodeError as e:
|
196 |
return {"error": f"Error decoding JSON: {e}"}
|
@@ -228,8 +234,22 @@ def extract_contact_details(text):
|
|
228 |
\+91\s\d{5}-\d{5} | # India Intl +91 XXXXX-XXXXX
|
229 |
\+91\s\d{4}-\d{6} | # India Intl +91 XXXX-XXXXXX
|
230 |
\+91\s\d{10} | # India Intl +91 XXXXXXXXXX
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
231 |
0\d{2}-\d{7} | # India STD 0XX-XXXXXXX
|
232 |
\+91\d{10} | # +91 XXXXXXXXXX
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
233 |
\+49\s\d{4}\s\d{8} | # Germany Intl +49 XXXX XXXXXXXX
|
234 |
\+49\s\d{3}\s\d{7} | # Germany Intl +49 XXX XXXXXXX
|
235 |
0\d{3}\s\d{8} | # Germany STD 0XXX XXXXXXXX
|
@@ -385,16 +405,25 @@ def process_resume_data(LLMdata,cont_data,extracted_text):
|
|
385 |
|
386 |
# Initialize the processed data dictionary
|
387 |
processed_data = {
|
388 |
-
"name": [
|
389 |
-
"contact_number": [
|
390 |
-
"Designation":[
|
391 |
-
"email": [
|
392 |
-
"Location": [
|
393 |
-
"Link": [
|
394 |
-
"Company":[
|
395 |
"extracted_text": extracted_text
|
396 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
397 |
processed_data['email'].extend(cont_data.get("emails", []))
|
398 |
processed_data['contact_number'].extend(cont_data.get("phone_numbers", []))
|
399 |
processed_data['Link'].extend(cont_data.get("links_RE", []))
|
400 |
-
return processed_data
|
|
|
# PaddleOCR caches its detection/recognition models under PADDLEOCR_HOME;
# point it at /tmp, which is writable in a Hugging Face Space container.
os.environ['PADDLEOCR_HOME'] = '/tmp/.paddleocr'

RESULT_FOLDER = 'static/results/'
JSON_FOLDER = 'static/json/'

# BUG FIX: the original wrapped `os.makedirs(RESULT_FOLDER, ...)` in
# `if not os.path.exists('/tmp/.paddleocr'):` — it tested one path but
# created a different one, so RESULT_FOLDER was only created when the OCR
# cache happened to be missing, JSON_FOLDER was never created at all, and
# the PADDLEOCR_HOME directory itself was never ensured. Create every
# required directory unconditionally; exist_ok=True makes this idempotent.
os.makedirs('/tmp/.paddleocr', exist_ok=True)
os.makedirs(RESULT_FOLDER, exist_ok=True)
os.makedirs(JSON_FOLDER, exist_ok=True)
|
# Load image using OpenCV
def load_image(image_path):
    """Load an image from disk with OpenCV.

    Parameters
    ----------
    image_path : str
        Path to the image file.

    Returns
    -------
    numpy.ndarray
        The decoded image as returned by ``cv2.imread`` (BGR channel
        order).

    Raises
    ------
    ValueError
        If the file extension is not a supported image format, or if
        OpenCV cannot decode the file (corrupted file or wrong path).
    """
    ext = os.path.splitext(image_path)[1].lower()
    if ext not in ('.png', '.jpg', '.jpeg', '.webp', '.tiff'):
        raise ValueError(
            f"Could not load image from {image_path}. It may be corrupted or the path is incorrect. or in a not supported format"
        )
    image = cv2.imread(image_path)
    # BUG FIX: cv2.imread does NOT raise on failure — it returns None for
    # a missing or undecodable file. The original returned that None and
    # the caller crashed later; fail fast with a clear error instead.
    if image is None:
        raise ValueError(
            f"Could not load image from {image_path}. It may be corrupted or the path is incorrect."
        )
    return image
57 |
# Function for upscaling image using OpenCV's INTER_CUBIC
|
58 |
def upscale_image(image, scale=2):
|
59 |
height, width = image.shape[:2]
|
|
|
175 |
# Function to call the Gemma model and process the output as Json
|
176 |
def Data_Extractor(data, client=client):
|
177 |
text = f'''Act as a Text extractor for the following text given in text: {data}
|
178 |
+
Extract text in the following output JSON string:
|
179 |
{{
|
180 |
"Name": ["Identify and Extract All the person's name from the text."],
|
181 |
"Designation": ["Extract All the designation or job title mentioned in the text."],
|
|
|
184 |
"Address": ["Extract All the full postal address or location mentioned in the text."],
|
185 |
"Email": ["Identify and Extract All valid email addresses mentioned in the text else 'Not found'."],
|
186 |
"Link": ["Identify and Extract any website URLs or social media links present in the text."]
|
187 |
+
}}
|
188 |
+
|
189 |
+
Output:
|
190 |
'''
|
191 |
# Call the API for inference
|
192 |
+
response = client.text_generation(text, max_new_tokens=1000, temperature=0.4, top_k=50, top_p=0.9, repetition_penalty=1.2)
|
193 |
|
194 |
print("parse in text ---:",response)
|
195 |
|
196 |
# Convert the response text to JSON
|
197 |
try:
|
198 |
json_data = json.loads(response)
|
199 |
+
print("Json_data-------------->",json_data)
|
200 |
return json_data
|
201 |
except json.JSONDecodeError as e:
|
202 |
return {"error": f"Error decoding JSON: {e}"}
|
|
|
234 |
\+91\s\d{5}-\d{5} | # India Intl +91 XXXXX-XXXXX
|
235 |
\+91\s\d{4}-\d{6} | # India Intl +91 XXXX-XXXXXX
|
236 |
\+91\s\d{10} | # India Intl +91 XXXXXXXXXX
|
237 |
+
\+91\s\d{3}\s\d{3}\s\d{4} | # India Intl +91 XXX XXX XXXX
|
238 |
+
\+91\s\d{3}-\d{3}-\d{4} | # India Intl +91 XXX-XXX-XXXX
|
239 |
+
\+91\s\d{2}\s\d{4}\s\d{4} | # India Intl +91 XX XXXX XXXX
|
240 |
+
\+91\s\d{2}-\d{4}-\d{4} | # India Intl +91 XX-XXXX-XXXX
|
241 |
+
\+91\s\d{5}\s\d{5} | # India Intl +91 XXXXX XXXXX
|
242 |
+
\d{5}\s\d{5} | # India XXXXX XXXXX
|
243 |
+
\d{5}-\d{5} | # India XXXXX-XXXXX
|
244 |
0\d{2}-\d{7} | # India STD 0XX-XXXXXXX
|
245 |
\+91\d{10} | # +91 XXXXXXXXXX
|
246 |
+
\d{10} | # XXXXXXXXXX # Here is the regex to handle all possible combination of the contact
|
247 |
+
\d{6}-\d{4} | # XXXXXX-XXXX
|
248 |
+
\d{4}-\d{6} | # XXXX-XXXXXX
|
249 |
+
\d{3}\s\d{3}\s\d{4} | # XXX XXX XXXX
|
250 |
+
\d{3}-\d{3}-\d{4} | # XXX-XXX-XXXX
|
251 |
+
\d{4}\s\d{3}\s\d{3} | # XXXX XXX XXX
|
252 |
+
\d{4}-\d{3}-\d{3} | # XXXX-XXX-XXX #-----
|
253 |
\+49\s\d{4}\s\d{8} | # Germany Intl +49 XXXX XXXXXXXX
|
254 |
\+49\s\d{3}\s\d{7} | # Germany Intl +49 XXX XXXXXXX
|
255 |
0\d{3}\s\d{8} | # Germany STD 0XXX XXXXXXXX
|
|
|
405 |
|
406 |
# Initialize the processed data dictionary
|
407 |
processed_data = {
|
408 |
+
"name": [],
|
409 |
+
"contact_number": [],
|
410 |
+
"Designation":[],
|
411 |
+
"email": [],
|
412 |
+
"Location": [],
|
413 |
+
"Link": [],
|
414 |
+
"Company":[],
|
415 |
"extracted_text": extracted_text
|
416 |
}
|
417 |
+
#LLM
|
418 |
+
processed_data['name'].extend(LLMdata.get('Name', []))
|
419 |
+
processed_data['contact_number'].extend(LLMdata.get('Contact', []))
|
420 |
+
processed_data['Designation'].extend(LLMdata.get('Designation', []))
|
421 |
+
processed_data['email'].extend(LLMdata.get("Email", []))
|
422 |
+
processed_data['Location'].extend(LLMdata.get('Address', []))
|
423 |
+
processed_data['Link'].extend(LLMdata.get('Link', []))
|
424 |
+
processed_data['Company'].extend(LLMdata.get('Company', []))
|
425 |
+
#Contact
|
426 |
processed_data['email'].extend(cont_data.get("emails", []))
|
427 |
processed_data['contact_number'].extend(cont_data.get("phone_numbers", []))
|
428 |
processed_data['Link'].extend(cont_data.get("links_RE", []))
|
429 |
+
return processed_data
|