Spaces:
Sleeping
Sleeping
WebashalarForML
committed on
Update utility/utils.py
Browse files- utility/utils.py +16 -6
utility/utils.py
CHANGED
@@ -5,6 +5,7 @@ from dotenv import load_dotenv
|
|
5 |
import json
|
6 |
import re
|
7 |
import easyocr
|
|
|
8 |
from PIL import Image, ImageEnhance, ImageDraw
|
9 |
import cv2
|
10 |
import numpy as np
|
@@ -22,6 +23,9 @@ client = InferenceClient(model="mistralai/Mistral-7B-Instruct-v0.3", token=HFT)
|
|
22 |
# Initialize EasyOCR reader for extracting text
|
23 |
reader = easyocr.Reader(['en'])
|
24 |
|
|
|
|
|
|
|
25 |
def draw_boxes(image, bounds, color='red', width=2):
|
26 |
draw = ImageDraw.Draw(image)
|
27 |
for bound in bounds:
|
@@ -95,11 +99,12 @@ def ocr_with_paddle(img):
|
|
95 |
return finaltext
|
96 |
|
97 |
|
98 |
-
def extract_text_from_images(image_paths):
|
99 |
all_extracted_texts = {}
|
100 |
all_extracted_imgs={}
|
101 |
for image_path in image_paths:
|
102 |
# Enhance the image before OCR
|
|
|
103 |
enhanced_image = process_image(image_path, scale=2)
|
104 |
bounds = reader.readtext(enhanced_image)
|
105 |
# Draw boxes on the processed image
|
@@ -111,15 +116,18 @@ def extract_text_from_images(image_paths):
|
|
111 |
|
112 |
# Perform OCR on the enhanced image
|
113 |
result=ocr_with_paddle(enhanced_image)
|
114 |
-
|
|
|
|
|
115 |
all_extracted_texts[image_path] =result
|
116 |
all_extracted_imgs[image_path] = result_image_path
|
117 |
# Convert to JSON-compatible structure
|
118 |
all_extracted_imgs_json = {str(k): str(v) for k, v in all_extracted_imgs.items()}
|
119 |
return all_extracted_texts,all_extracted_imgs_json
|
120 |
|
|
|
121 |
# Function to call the text-generation model behind `client` and process the output as JSON
|
122 |
-
def Data_Extractor(data, client):
|
123 |
text = f'''Act as a Text extractor for the following text given in text: {data}
|
124 |
extract text in the following output JSON string:
|
125 |
{{
|
@@ -134,7 +142,7 @@ def Data_Extractor(data, client):
|
|
134 |
Output:
|
135 |
'''
|
136 |
# Call the API for inference
|
137 |
-
response = client.text_generation(text, max_new_tokens=600)
|
138 |
|
139 |
print("parse in text ---:",response)
|
140 |
|
@@ -280,9 +288,10 @@ def extract_contact_details(text):
|
|
280 |
# Email regex
|
281 |
email_regex = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
|
282 |
|
283 |
-
#
|
|
|
|
|
284 |
link_regex = re.compile(r'\b(?:https?:\/\/)?(?:www\.)[a-zA-Z0-9-]+\.(?:com|co\.in|co|io|org|net|edu|gov|mil|int|uk|us|in|de|au|app|tech|xyz|info|biz|fr|dev)\b')
|
285 |
-
|
286 |
# Find all matches in the text
|
287 |
phone_numbers = [num for num in combined_phone_regex.findall(text) if len(num) >= 5]
|
288 |
print("phone_numbers--->",phone_numbers)
|
@@ -320,6 +329,7 @@ def process_extracted_text(extracted_text):
|
|
320 |
combined_results["links_RE"].extend(contact_details["links_RE"])
|
321 |
|
322 |
# Keep the combined results as a dict (no JSON serialization happens here)
|
|
|
323 |
combined_results_json = combined_results
|
324 |
|
325 |
# Print the final JSON results
|
|
|
5 |
import json
|
6 |
import re
|
7 |
import easyocr
|
8 |
+
import spacy
|
9 |
from PIL import Image, ImageEnhance, ImageDraw
|
10 |
import cv2
|
11 |
import numpy as np
|
|
|
23 |
# Initialize EasyOCR reader for extracting text
|
24 |
reader = easyocr.Reader(['en'])
|
25 |
|
26 |
+
# Initialize spaCy's English model
|
27 |
+
nlp = spacy.load("en_core_web_sm")
|
28 |
+
|
29 |
def draw_boxes(image, bounds, color='red', width=2):
|
30 |
draw = ImageDraw.Draw(image)
|
31 |
for bound in bounds:
|
|
|
99 |
return finaltext
|
100 |
|
101 |
|
102 |
+
def extract_text_from_images(image_paths, RESULT_FOLDER):
|
103 |
all_extracted_texts = {}
|
104 |
all_extracted_imgs={}
|
105 |
for image_path in image_paths:
|
106 |
# Enhance the image before OCR
|
107 |
+
#enhanced_image = load_image(image_path)
|
108 |
enhanced_image = process_image(image_path, scale=2)
|
109 |
bounds = reader.readtext(enhanced_image)
|
110 |
# Draw boxes on the processed image
|
|
|
116 |
|
117 |
# Perform OCR on the enhanced image
|
118 |
result=ocr_with_paddle(enhanced_image)
|
119 |
+
# results = reader.readtext(enhanced_image)
|
120 |
+
# extracted_text = " ".join([res[1] for res in results])
|
121 |
+
|
122 |
all_extracted_texts[image_path] =result
|
123 |
all_extracted_imgs[image_path] = result_image_path
|
124 |
# Convert to JSON-compatible structure
|
125 |
all_extracted_imgs_json = {str(k): str(v) for k, v in all_extracted_imgs.items()}
|
126 |
return all_extracted_texts,all_extracted_imgs_json
|
127 |
|
128 |
+
|
129 |
# Function to call the text-generation model (by default the module-level Mistral-7B-Instruct client) and process the output as JSON
|
130 |
+
def Data_Extractor(data, client=client):
|
131 |
text = f'''Act as a Text extractor for the following text given in text: {data}
|
132 |
extract text in the following output JSON string:
|
133 |
{{
|
|
|
142 |
Output:
|
143 |
'''
|
144 |
# Call the API for inference
|
145 |
+
response = client.text_generation(text, max_new_tokens=600)#, temperature=0.4, top_k=50, top_p=0.9, repetition_penalty=1.2)
|
146 |
|
147 |
print("parse in text ---:",response)
|
148 |
|
|
|
288 |
# Email regex
|
289 |
email_regex = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
|
290 |
|
291 |
+
# Profile links regex, updated to avoid conflicts with email domains
|
292 |
+
#link_regex = re.compile(r'\b(?:https?://)?(?:www\.)?(?:linkedin\.com|github\.com|indeed\.com|[A-Za-z0-9-]+\.[A-Za-z]{2,})[\w./?-]*\b')
|
293 |
+
#link_regex = re.compile(r'\b(?:https?://)?(?:www\.)?[a-zA-Z0-9-]+\.(?:[a-zA-Z]{2,})(?:\.[a-zA-Z]{2,})?(?:\.[a-zA-Z]{2,})?(?:[/\w.-]*)*[\w/?&=-]*\b')
|
294 |
link_regex = re.compile(r'\b(?:https?:\/\/)?(?:www\.)[a-zA-Z0-9-]+\.(?:com|co\.in|co|io|org|net|edu|gov|mil|int|uk|us|in|de|au|app|tech|xyz|info|biz|fr|dev)\b')
|
|
|
295 |
# Find all matches in the text
|
296 |
phone_numbers = [num for num in combined_phone_regex.findall(text) if len(num) >= 5]
|
297 |
print("phone_numbers--->",phone_numbers)
|
|
|
329 |
combined_results["links_RE"].extend(contact_details["links_RE"])
|
330 |
|
331 |
# Combined results are kept as a dict; JSON serialization (json.dumps) is currently disabled
|
332 |
+
#combined_results_json = json.dumps(combined_results, indent=4)
|
333 |
combined_results_json = combined_results
|
334 |
|
335 |
# Print the final JSON results
|