import cv2
import torch
import warnings
import numpy as np
from PIL import Image
from math import sqrt
import mediapipe as mp
from transformers import pipeline

warnings.filterwarnings("ignore")


class ExtractorMediaPipe:
    """Extracts face, eye, and iris crops from an image using MediaPipe,
    with optional blink detection backed by a ViT eye-state classifier."""

    def __init__(self, upscale=1):
        self.upscale = int(upscale)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # ========== Face Extraction ==========
        self.face_detector = mp.solutions.face_detection.FaceDetection(
            model_selection=0, min_detection_confidence=0.5
        )
        self.face_mesh = mp.solutions.face_mesh.FaceMesh(
            max_num_faces=1,
            static_image_mode=True,
            refine_landmarks=True,
            min_detection_confidence=0.5,
            min_tracking_confidence=0.5,
        )

        # ========== Eyes Extraction ==========
        self.RIGHT_EYE = [362, 382, 381, 380, 374, 373, 390, 249, 263, 466, 388, 387, 386, 385, 384, 398]
        self.LEFT_EYE = [33, 7, 163, 144, 145, 153, 154, 155, 133, 173, 157, 158, 159, 160, 161, 246]

        # https://huggingface.co/dima806/closed_eyes_image_detection
        # https://www.kaggle.com/code/dima806/closed-eye-image-detection-vit
        self.pipe = pipeline(
            "image-classification",
            model="dima806/closed_eyes_image_detection",
            device=self.device,
        )
        self.blink_lower_thresh = 0.22
        self.blink_upper_thresh = 0.25
        self.blink_confidence = 0.50

        # ========== Iris Extraction ==========
        self.RIGHT_IRIS = [474, 475, 476, 477]
        self.LEFT_IRIS = [469, 470, 471, 472]

    def extract_face(self, image):
        """Detect a face and return a (256 * upscale)-square crop centered on it, or None."""
        tmp_image = image.copy()
        results = self.face_detector.process(tmp_image)
        if not results.detections:
            # print("No face detected")
            return None

        bboxC = results.detections[0].location_data.relative_bounding_box
        ih, iw, _ = image.shape

        # Get bounding box coordinates
        x, y, w, h = (
            int(bboxC.xmin * iw),
            int(bboxC.ymin * ih),
            int(bboxC.width * iw),
            int(bboxC.height * ih),
        )

        # Calculate the center of the bounding box
        center_x = x + w // 2
        center_y = y + h // 2

        # Calculate new bounds ensuring they fit within the image dimensions
        half_size = 128 * self.upscale
        x1 = max(center_x - half_size, 0)
        y1 = max(center_y - half_size, 0)
        x2 = min(center_x + half_size, iw)
        y2 = min(center_y + half_size, ih)

        # Adjust x1, x2, y1, and y2 to ensure the cropped region is exactly
        # (256 * self.upscale) x (256 * self.upscale)
        if x2 - x1 < (256 * self.upscale):
            if x1 == 0:
                x2 = min((256 * self.upscale), iw)
            elif x2 == iw:
                x1 = max(iw - (256 * self.upscale), 0)
        if y2 - y1 < (256 * self.upscale):
            if y1 == 0:
                y2 = min((256 * self.upscale), ih)
            elif y2 == ih:
                y1 = max(ih - (256 * self.upscale), 0)

        cropped_face = image[y1:y2, x1:x2]

        # bicubic upsampling
        # if self.upscale != 1:
        #     cropped_face = cv2.resize(
        #         cropped_face,
        #         (256 * self.upscale, 256 * self.upscale),
        #         interpolation=cv2.INTER_CUBIC,
        #     )

        return cropped_face

    @staticmethod
    def landmarksDetection(image, results, draw=False):
        """Convert normalized face-mesh landmarks to pixel coordinates; optionally draw them."""
        image_height, image_width = image.shape[:2]
        mesh_coordinates = [
            (int(point.x * image_width), int(point.y * image_height))
            for point in results.multi_face_landmarks[0].landmark
        ]
        if draw:
            for point in mesh_coordinates:
                cv2.circle(image, point, 2, (0, 255, 0), -1)
        return mesh_coordinates

    @staticmethod
    def euclideanDistance(point, point1):
        x, y = point
        x1, y1 = point1
        distance = sqrt((x1 - x) ** 2 + (y1 - y) ** 2)
        return distance
    def blinkRatio(self, landmarks, right_indices, left_indices):
        """Eye-opening ratio (vertical / horizontal eye extent), averaged over both eyes."""
        right_eye_landmark1 = landmarks[right_indices[0]]
        right_eye_landmark2 = landmarks[right_indices[8]]
        right_eye_landmark3 = landmarks[right_indices[12]]
        right_eye_landmark4 = landmarks[right_indices[4]]
        left_eye_landmark1 = landmarks[left_indices[0]]
        left_eye_landmark2 = landmarks[left_indices[8]]
        left_eye_landmark3 = landmarks[left_indices[12]]
        left_eye_landmark4 = landmarks[left_indices[4]]

        right_eye_horizontal_distance = self.euclideanDistance(right_eye_landmark1, right_eye_landmark2)
        right_eye_vertical_distance = self.euclideanDistance(right_eye_landmark3, right_eye_landmark4)
        left_eye_vertical_distance = self.euclideanDistance(left_eye_landmark3, left_eye_landmark4)
        left_eye_horizontal_distance = self.euclideanDistance(left_eye_landmark1, left_eye_landmark2)

        right_eye_ratio = right_eye_vertical_distance / right_eye_horizontal_distance
        left_eye_ratio = left_eye_vertical_distance / left_eye_horizontal_distance

        eyes_ratio = (right_eye_ratio + left_eye_ratio) / 2
        return eyes_ratio

    def extract_eyes_regions(self, image, landmarks, eye_indices):
        """Crop a fixed (32 * upscale) x (16 * upscale) region centered on one eye."""
        h, w, _ = image.shape
        points = [(int(landmarks[idx].x * w), int(landmarks[idx].y * h)) for idx in eye_indices]

        x_min = min([p[0] for p in points])
        x_max = max([p[0] for p in points])
        y_min = min([p[1] for p in points])
        y_max = max([p[1] for p in points])

        center_x = (x_min + x_max) // 2
        center_y = (y_min + y_max) // 2

        target_width = 32 * self.upscale
        target_height = 16 * self.upscale

        x1 = max(center_x - target_width // 2, 0)
        y1 = max(center_y - target_height // 2, 0)
        x2 = x1 + target_width
        y2 = y1 + target_height

        if x2 > w:
            x1 = w - target_width
            x2 = w
        if y2 > h:
            y1 = h - target_height
            y2 = h

        return image[y1:y2, x1:x2]

    def blink_detection_model(self, left_eye, right_eye):
        """Classify each eye crop with the ViT eye-state model; True if either eye looks closed."""
        left_eye = cv2.cvtColor(left_eye, cv2.COLOR_RGB2GRAY)
        left_eye = Image.fromarray(left_eye)
        preds_left = self.pipe(left_eye)
        if preds_left[0]["label"] == "closeEye":
            closed_left = preds_left[0]["score"] >= self.blink_confidence
        else:
            closed_left = preds_left[1]["score"] >= self.blink_confidence

        right_eye = cv2.cvtColor(right_eye, cv2.COLOR_RGB2GRAY)
        right_eye = Image.fromarray(right_eye)
        preds_right = self.pipe(right_eye)
        if preds_right[0]["label"] == "closeEye":
            closed_right = preds_right[0]["score"] >= self.blink_confidence
        else:
            closed_right = preds_right[1]["score"] >= self.blink_confidence

        # print("preds_left = ", preds_left)
        # print("preds_right = ", preds_right)
        return closed_left or closed_right

    def extract_eyes(self, image, blink_detection=False):
        """Extract left/right eye crops; optionally detect blinks (ratio check + ViT confirmation)."""
        tmp_face = image.copy()
        results = self.face_mesh.process(tmp_face)
        if results.multi_face_landmarks is None:
            return None

        face_landmarks = results.multi_face_landmarks[0].landmark
        left_eye = self.extract_eyes_regions(image, face_landmarks, self.LEFT_EYE)
        right_eye = self.extract_eyes_regions(image, face_landmarks, self.RIGHT_EYE)

        blinked = False
        eyes_ratio = None
        if blink_detection:
            mesh_coordinates = self.landmarksDetection(image, results, False)
            eyes_ratio = self.blinkRatio(mesh_coordinates, self.RIGHT_EYE, self.LEFT_EYE)
            if self.blink_lower_thresh < eyes_ratio <= self.blink_upper_thresh:
                # Ambiguous ratio: confirm with the ViT eye-state model
                # print("I think person blinked. eyes_ratio = ", eyes_ratio, "Confirming with ViT model...")
                blinked = self.blink_detection_model(left_eye=left_eye, right_eye=right_eye)
                # if blinked:
                #     print("Yes, person blinked. Confirmed by model")
                # else:
                #     print("No, person didn't blink. False alarm")
            elif eyes_ratio <= self.blink_lower_thresh:
                blinked = True
                # print("Surely person blinked. eyes_ratio = ", eyes_ratio)
            else:
                blinked = False

        return {"left_eye": left_eye, "right_eye": right_eye, "blinked": blinked, "eyes_ratio": eyes_ratio}
eyes_ratio = ", eyes_ratio) else: blinked = False return {"left_eye": left_eye, "right_eye": right_eye, "blinked": blinked, "eyes_ratio": eyes_ratio} @staticmethod def segment_iris(iris_img): # Convert RGB image to grayscale iris_img_gray = cv2.cvtColor(iris_img, cv2.COLOR_RGB2GRAY) # Apply Gaussian blur for denoising iris_img_blur = cv2.GaussianBlur(iris_img_gray, (5, 5), 0) # Perform adaptive thresholding _, iris_img_mask = cv2.threshold(iris_img_blur, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU) # Invert the mask segmented_mask = cv2.bitwise_not(iris_img_mask) segmented_mask = cv2.cvtColor(segmented_mask, cv2.COLOR_GRAY2RGB) segmented_iris = cv2.bitwise_and(iris_img, segmented_mask) return { "segmented_iris": segmented_iris, "segmented_mask": segmented_mask, } def extract_iris(self, image): ih, iw, _ = image.shape tmp_face = image.copy() results = self.face_mesh.process(tmp_face) if results.multi_face_landmarks is None: return None mesh_coordinates = self.landmarksDetection(image, results, False) mesh_points = np.array(mesh_coordinates) (l_cx, l_cy), l_radius = cv2.minEnclosingCircle(mesh_points[self.LEFT_IRIS]) (r_cx, r_cy), r_radius = cv2.minEnclosingCircle(mesh_points[self.RIGHT_IRIS]) # Crop the left iris to be exactly 16*upscaled x 16*upscaled l_x1 = max(int(l_cx) - (8 * self.upscale), 0) l_y1 = max(int(l_cy) - (8 * self.upscale), 0) l_x2 = min(int(l_cx) + (8 * self.upscale), iw) l_y2 = min(int(l_cy) + (8 * self.upscale), ih) cropped_left_iris = image[l_y1:l_y2, l_x1:l_x2] left_iris_segmented_data = self.segment_iris(cv2.cvtColor(cropped_left_iris, cv2.COLOR_BGR2RGB)) # Crop the right iris to be exactly 16*upscaled x 16*upscaled r_x1 = max(int(r_cx) - (8 * self.upscale), 0) r_y1 = max(int(r_cy) - (8 * self.upscale), 0) r_x2 = min(int(r_cx) + (8 * self.upscale), iw) r_y2 = min(int(r_cy) + (8 * self.upscale), ih) cropped_right_iris = image[r_y1:r_y2, r_x1:r_x2] right_iris_segmented_data = self.segment_iris(cv2.cvtColor(cropped_right_iris, cv2.COLOR_BGR2RGB)) return { "left_iris": { "img": cropped_left_iris, "segmented_iris": left_iris_segmented_data["segmented_iris"], "segmented_mask": left_iris_segmented_data["segmented_mask"], }, "right_iris": { "img": cropped_right_iris, "segmented_iris": right_iris_segmented_data["segmented_iris"], "segmented_mask": right_iris_segmented_data["segmented_mask"], }, }