Spaces:
Sleeping
Sleeping
WebashalarForML
committed on
Commit
•
3bc2e7c
1
Parent(s):
11d890f
Update utility/utils.py
Browse files- utility/utils.py +46 -17
utility/utils.py
CHANGED
@@ -24,6 +24,8 @@ logging.basicConfig(
|
|
24 |
os.environ['PADDLEOCR_HOME'] = '/tmp/.paddleocr'
|
25 |
|
26 |
RESULT_FOLDER = 'static/results/'
|
|
|
|
|
27 |
if not os.path.exists('/tmp/.paddleocr'):
|
28 |
os.makedirs(RESULT_FOLDER, exist_ok=True)
|
29 |
|
@@ -45,11 +47,13 @@ client = InferenceClient(model="mistralai/Mistral-7B-Instruct-v0.3", token=HFT)
|
|
45 |
|
46 |
# Load image using OpenCV
|
47 |
def load_image(image_path):
|
48 |
-
|
49 |
-
if
|
50 |
-
|
51 |
-
|
52 |
-
|
|
|
|
|
53 |
# Function for upscaling image using OpenCV's INTER_CUBIC
|
54 |
def upscale_image(image, scale=2):
|
55 |
height, width = image.shape[:2]
|
@@ -171,7 +175,7 @@ def extract_text_from_images(image_paths):
|
|
171 |
# Function to call the Gemma model and process the output as Json
|
172 |
def Data_Extractor(data, client=client):
|
173 |
text = f'''Act as a Text extractor for the following text given in text: {data}
|
174 |
-
|
175 |
{{
|
176 |
"Name": ["Identify and Extract All the person's name from the text."],
|
177 |
"Designation": ["Extract All the designation or job title mentioned in the text."],
|
@@ -180,17 +184,19 @@ def Data_Extractor(data, client=client):
|
|
180 |
"Address": ["Extract All the full postal address or location mentioned in the text."],
|
181 |
"Email": ["Identify and Extract All valid email addresses mentioned in the text else 'Not found'."],
|
182 |
"Link": ["Identify and Extract any website URLs or social media links present in the text."]
|
183 |
-
}}
|
184 |
-
|
|
|
185 |
'''
|
186 |
# Call the API for inference
|
187 |
-
response = client.text_generation(text, max_new_tokens=
|
188 |
|
189 |
print("parse in text ---:",response)
|
190 |
|
191 |
# Convert the response text to JSON
|
192 |
try:
|
193 |
json_data = json.loads(response)
|
|
|
194 |
return json_data
|
195 |
except json.JSONDecodeError as e:
|
196 |
return {"error": f"Error decoding JSON: {e}"}
|
@@ -228,8 +234,22 @@ def extract_contact_details(text):
|
|
228 |
\+91\s\d{5}-\d{5} | # India Intl +91 XXXXX-XXXXX
|
229 |
\+91\s\d{4}-\d{6} | # India Intl +91 XXXX-XXXXXX
|
230 |
\+91\s\d{10} | # India Intl +91 XXXXXXXXXX
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
231 |
0\d{2}-\d{7} | # India STD 0XX-XXXXXXX
|
232 |
\+91\d{10} | # +91 XXXXXXXXXX
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
233 |
\+49\s\d{4}\s\d{8} | # Germany Intl +49 XXXX XXXXXXXX
|
234 |
\+49\s\d{3}\s\d{7} | # Germany Intl +49 XXX XXXXXXX
|
235 |
0\d{3}\s\d{8} | # Germany STD 0XXX XXXXXXXX
|
@@ -385,16 +405,25 @@ def process_resume_data(LLMdata,cont_data,extracted_text):
|
|
385 |
|
386 |
# Initialize the processed data dictionary
|
387 |
processed_data = {
|
388 |
-
"name": [
|
389 |
-
"contact_number": [
|
390 |
-
"Designation":[
|
391 |
-
"email": [
|
392 |
-
"Location": [
|
393 |
-
"Link": [
|
394 |
-
"Company":[
|
395 |
"extracted_text": extracted_text
|
396 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
397 |
processed_data['email'].extend(cont_data.get("emails", []))
|
398 |
processed_data['contact_number'].extend(cont_data.get("phone_numbers", []))
|
399 |
processed_data['Link'].extend(cont_data.get("links_RE", []))
|
400 |
-
return processed_data
|
|
|
# PaddleOCR caches its detection/recognition models under PADDLEOCR_HOME;
# point it at /tmp, which is writable in a Hugging Face Space container.
os.environ['PADDLEOCR_HOME'] = '/tmp/.paddleocr'

RESULT_FOLDER = 'static/results/'
JSON_FOLDER = 'static/json/'

# BUG FIX: the original wrapped `os.makedirs(RESULT_FOLDER, ...)` in
# `if not os.path.exists('/tmp/.paddleocr'):` — it tested one path but
# created a different one, so RESULT_FOLDER was only created when the OCR
# cache happened to be missing, JSON_FOLDER was never created at all, and
# the PADDLEOCR_HOME directory itself was never ensured. Create every
# required directory unconditionally; exist_ok=True makes this idempotent.
os.makedirs('/tmp/.paddleocr', exist_ok=True)
os.makedirs(RESULT_FOLDER, exist_ok=True)
os.makedirs(JSON_FOLDER, exist_ok=True)
|
# Load image using OpenCV
def load_image(image_path):
    """Load an image from disk with OpenCV.

    Parameters
    ----------
    image_path : str
        Path to the image file.

    Returns
    -------
    numpy.ndarray
        The decoded image as returned by ``cv2.imread`` (BGR channel
        order).

    Raises
    ------
    ValueError
        If the file extension is not a supported image format, or if
        OpenCV cannot decode the file (corrupted file or wrong path).
    """
    ext = os.path.splitext(image_path)[1].lower()
    if ext not in ('.png', '.jpg', '.jpeg', '.webp', '.tiff'):
        raise ValueError(
            f"Could not load image from {image_path}. It may be corrupted or the path is incorrect. or in a not supported format"
        )
    image = cv2.imread(image_path)
    # BUG FIX: cv2.imread does NOT raise on failure — it returns None for
    # a missing or undecodable file. The original returned that None and
    # the caller crashed later; fail fast with a clear error instead.
    if image is None:
        raise ValueError(
            f"Could not load image from {image_path}. It may be corrupted or the path is incorrect."
        )
    return image
57 |
# Function for upscaling image using OpenCV's INTER_CUBIC
|
58 |
def upscale_image(image, scale=2):
|
59 |
height, width = image.shape[:2]
|
|
|
175 |
# Function to call the Gemma model and process the output as Json
|
176 |
def Data_Extractor(data, client=client):
|
177 |
text = f'''Act as a Text extractor for the following text given in text: {data}
|
178 |
+
Extract text in the following output JSON string:
|
179 |
{{
|
180 |
"Name": ["Identify and Extract All the person's name from the text."],
|
181 |
"Designation": ["Extract All the designation or job title mentioned in the text."],
|
|
|
184 |
"Address": ["Extract All the full postal address or location mentioned in the text."],
|
185 |
"Email": ["Identify and Extract All valid email addresses mentioned in the text else 'Not found'."],
|
186 |
"Link": ["Identify and Extract any website URLs or social media links present in the text."]
|
187 |
+
}}
|
188 |
+
|
189 |
+
Output:
|
190 |
'''
|
191 |
# Call the API for inference
|
192 |
+
response = client.text_generation(text, max_new_tokens=1000, temperature=0.4, top_k=50, top_p=0.9, repetition_penalty=1.2)
|
193 |
|
194 |
print("parse in text ---:",response)
|
195 |
|
196 |
# Convert the response text to JSON
|
197 |
try:
|
198 |
json_data = json.loads(response)
|
199 |
+
print("Json_data-------------->",json_data)
|
200 |
return json_data
|
201 |
except json.JSONDecodeError as e:
|
202 |
return {"error": f"Error decoding JSON: {e}"}
|
|
|
234 |
\+91\s\d{5}-\d{5} | # India Intl +91 XXXXX-XXXXX
|
235 |
\+91\s\d{4}-\d{6} | # India Intl +91 XXXX-XXXXXX
|
236 |
\+91\s\d{10} | # India Intl +91 XXXXXXXXXX
|
237 |
+
\+91\s\d{3}\s\d{3}\s\d{4} | # India Intl +91 XXX XXX XXXX
|
238 |
+
\+91\s\d{3}-\d{3}-\d{4} | # India Intl +91 XXX-XXX-XXXX
|
239 |
+
\+91\s\d{2}\s\d{4}\s\d{4} | # India Intl +91 XX XXXX XXXX
|
240 |
+
\+91\s\d{2}-\d{4}-\d{4} | # India Intl +91 XX-XXXX-XXXX
|
241 |
+
\+91\s\d{5}\s\d{5} | # India Intl +91 XXXXX XXXXX
|
242 |
+
\d{5}\s\d{5} | # India XXXXX XXXXX
|
243 |
+
\d{5}-\d{5} | # India XXXXX-XXXXX
|
244 |
0\d{2}-\d{7} | # India STD 0XX-XXXXXXX
|
245 |
\+91\d{10} | # +91 XXXXXXXXXX
|
246 |
+
\d{10} | # XXXXXXXXXX # Here is the regex to handle all possible combination of the contact
|
247 |
+
\d{6}-\d{4} | # XXXXXX-XXXX
|
248 |
+
\d{4}-\d{6} | # XXXX-XXXXXX
|
249 |
+
\d{3}\s\d{3}\s\d{4} | # XXX XXX XXXX
|
250 |
+
\d{3}-\d{3}-\d{4} | # XXX-XXX-XXXX
|
251 |
+
\d{4}\s\d{3}\s\d{3} | # XXXX XXX XXX
|
252 |
+
\d{4}-\d{3}-\d{3} | # XXXX-XXX-XXX #-----
|
253 |
\+49\s\d{4}\s\d{8} | # Germany Intl +49 XXXX XXXXXXXX
|
254 |
\+49\s\d{3}\s\d{7} | # Germany Intl +49 XXX XXXXXXX
|
255 |
0\d{3}\s\d{8} | # Germany STD 0XXX XXXXXXXX
|
|
|
405 |
|
406 |
# Initialize the processed data dictionary
|
407 |
processed_data = {
|
408 |
+
"name": [],
|
409 |
+
"contact_number": [],
|
410 |
+
"Designation":[],
|
411 |
+
"email": [],
|
412 |
+
"Location": [],
|
413 |
+
"Link": [],
|
414 |
+
"Company":[],
|
415 |
"extracted_text": extracted_text
|
416 |
}
|
417 |
+
#LLM
|
418 |
+
processed_data['name'].extend(LLMdata.get('Name', []))
|
419 |
+
processed_data['contact_number'].extend(LLMdata.get('Contact', []))
|
420 |
+
processed_data['Designation'].extend(LLMdata.get('Designation', []))
|
421 |
+
processed_data['email'].extend(LLMdata.get("Email", []))
|
422 |
+
processed_data['Location'].extend(LLMdata.get('Address', []))
|
423 |
+
processed_data['Link'].extend(LLMdata.get('Link', []))
|
424 |
+
processed_data['Company'].extend(LLMdata.get('Company', []))
|
425 |
+
#Contact
|
426 |
processed_data['email'].extend(cont_data.get("emails", []))
|
427 |
processed_data['contact_number'].extend(cont_data.get("phone_numbers", []))
|
428 |
processed_data['Link'].extend(cont_data.get("links_RE", []))
|
429 |
+
return processed_data
|