import cv2
import torch
import warnings
import numpy as np
from PIL import Image
from math import sqrt
import mediapipe as mp
from transformers import pipeline

warnings.filterwarnings("ignore")

class ExtractorMediaPipe:
    """Extracts face, eye, and iris crops from an RGB image using MediaPipe,
    with optional blink detection backed by a ViT image classifier."""

    def __init__(self, upscale=1):
        self.upscale = int(upscale)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # ========== Face Extraction ==========
        self.face_detector = mp.solutions.face_detection.FaceDetection(
            model_selection=0, min_detection_confidence=0.5
        )
        self.face_mesh = mp.solutions.face_mesh.FaceMesh(
            max_num_faces=1,
            static_image_mode=True,
            refine_landmarks=True,
            min_detection_confidence=0.5,
            min_tracking_confidence=0.5,
        )

        # ========== Eyes Extraction ==========
        # FaceMesh landmark indices outlining each eye
        self.RIGHT_EYE = [362, 382, 381, 380, 374, 373, 390, 249, 263, 466, 388, 387, 386, 385, 384, 398]
        self.LEFT_EYE = [33, 7, 163, 144, 145, 153, 154, 155, 133, 173, 157, 158, 159, 160, 161, 246]

        # https://huggingface.co/dima806/closed_eyes_image_detection
        # https://www.kaggle.com/code/dima806/closed-eye-image-detection-vit
        self.pipe = pipeline(
            "image-classification",
            model="dima806/closed_eyes_image_detection",
            device=self.device,
        )
        self.blink_lower_thresh = 0.22
        self.blink_upper_thresh = 0.25
        self.blink_confidence = 0.50

        # ========== Iris Extraction ==========
        self.RIGHT_IRIS = [474, 475, 476, 477]
        self.LEFT_IRIS = [469, 470, 471, 472]

    def extract_face(self, image):
        tmp_image = image.copy()
        results = self.face_detector.process(tmp_image)
        if not results.detections:
            # print("No face detected")
            return None
        else:
            bboxC = results.detections[0].location_data.relative_bounding_box
            ih, iw, _ = image.shape
            # Get bounding box coordinates
            x, y, w, h = (
                int(bboxC.xmin * iw),
                int(bboxC.ymin * ih),
                int(bboxC.width * iw),
                int(bboxC.height * ih),
            )
            # Calculate the center of the bounding box
            center_x = x + w // 2
            center_y = y + h // 2
            # Calculate new bounds ensuring they fit within the image dimensions
            half_size = 128 * self.upscale
            x1 = max(center_x - half_size, 0)
            y1 = max(center_y - half_size, 0)
            x2 = min(center_x + half_size, iw)
            y2 = min(center_y + half_size, ih)
            # Adjust the bounds so the crop is exactly (256 * self.upscale) square
            if x2 - x1 < (256 * self.upscale):
                if x1 == 0:
                    x2 = min((256 * self.upscale), iw)
                elif x2 == iw:
                    x1 = max(iw - (256 * self.upscale), 0)
            if y2 - y1 < (256 * self.upscale):
                if y1 == 0:
                    y2 = min((256 * self.upscale), ih)
                elif y2 == ih:
                    y1 = max(ih - (256 * self.upscale), 0)
            cropped_face = image[y1:y2, x1:x2]
            # bicubic upsampling
            # if self.upscale != 1:
            #     cropped_face = cv2.resize(
            #         cropped_face,
            #         (256 * self.upscale, 256 * self.upscale),
            #         interpolation=cv2.INTER_CUBIC,
            #     )
            return cropped_face

    def landmarksDetection(self, image, results, draw=False):
        image_height, image_width = image.shape[:2]
        mesh_coordinates = [
            (int(point.x * image_width), int(point.y * image_height))
            for point in results.multi_face_landmarks[0].landmark
        ]
        if draw:
            for point in mesh_coordinates:
                cv2.circle(image, point, 2, (0, 255, 0), -1)
        return mesh_coordinates

    def euclideanDistance(self, point, point1):
        x, y = point
        x1, y1 = point1
        distance = sqrt((x1 - x) ** 2 + (y1 - y) ** 2)
        return distance

    def blinkRatio(self, landmarks, right_indices, left_indices):
        # Indices [0] and [8] are the eye corners (horizontal extent),
        # [12] and [4] the upper and lower lid points (vertical extent)
        right_eye_landmark1 = landmarks[right_indices[0]]
        right_eye_landmark2 = landmarks[right_indices[8]]
        right_eye_landmark3 = landmarks[right_indices[12]]
        right_eye_landmark4 = landmarks[right_indices[4]]
        left_eye_landmark1 = landmarks[left_indices[0]]
        left_eye_landmark2 = landmarks[left_indices[8]]
        left_eye_landmark3 = landmarks[left_indices[12]]
        left_eye_landmark4 = landmarks[left_indices[4]]

        right_eye_horizontal_distance = self.euclideanDistance(right_eye_landmark1, right_eye_landmark2)
        right_eye_vertical_distance = self.euclideanDistance(right_eye_landmark3, right_eye_landmark4)
        left_eye_vertical_distance = self.euclideanDistance(left_eye_landmark3, left_eye_landmark4)
        left_eye_horizontal_distance = self.euclideanDistance(left_eye_landmark1, left_eye_landmark2)

        right_eye_ratio = right_eye_vertical_distance / right_eye_horizontal_distance
        left_eye_ratio = left_eye_vertical_distance / left_eye_horizontal_distance
        eyes_ratio = (right_eye_ratio + left_eye_ratio) / 2
        return eyes_ratio

    def extract_eyes_regions(self, image, landmarks, eye_indices):
        h, w, _ = image.shape
        points = [(int(landmarks[idx].x * w), int(landmarks[idx].y * h)) for idx in eye_indices]
        x_min = min(p[0] for p in points)
        x_max = max(p[0] for p in points)
        y_min = min(p[1] for p in points)
        y_max = max(p[1] for p in points)
        center_x = (x_min + x_max) // 2
        center_y = (y_min + y_max) // 2

        target_width = 32 * self.upscale
        target_height = 16 * self.upscale
        x1 = max(center_x - target_width // 2, 0)
        y1 = max(center_y - target_height // 2, 0)
        x2 = x1 + target_width
        y2 = y1 + target_height
        if x2 > w:
            x1 = w - target_width
            x2 = w
        if y2 > h:
            y1 = h - target_height
            y2 = h
        return image[y1:y2, x1:x2]

    def blink_detection_model(self, left_eye, right_eye):
        left_eye = cv2.cvtColor(left_eye, cv2.COLOR_RGB2GRAY)
        left_eye = Image.fromarray(left_eye)
        preds_left = self.pipe(left_eye)
        if preds_left[0]["label"] == "closeEye":
            closed_left = preds_left[0]["score"] >= self.blink_confidence
        else:
            closed_left = preds_left[1]["score"] >= self.blink_confidence

        right_eye = cv2.cvtColor(right_eye, cv2.COLOR_RGB2GRAY)
        right_eye = Image.fromarray(right_eye)
        preds_right = self.pipe(right_eye)
        if preds_right[0]["label"] == "closeEye":
            closed_right = preds_right[0]["score"] >= self.blink_confidence
        else:
            closed_right = preds_right[1]["score"] >= self.blink_confidence

        # print("preds_left = ", preds_left)
        # print("preds_right = ", preds_right)
        return closed_left or closed_right

    def extract_eyes(self, image, blink_detection=False):
        tmp_face = image.copy()
        results = self.face_mesh.process(tmp_face)
        if results.multi_face_landmarks is None:
            return None
        face_landmarks = results.multi_face_landmarks[0].landmark
        left_eye = self.extract_eyes_regions(image, face_landmarks, self.LEFT_EYE)
        right_eye = self.extract_eyes_regions(image, face_landmarks, self.RIGHT_EYE)

        blinked = False
        eyes_ratio = None
        if blink_detection:
            mesh_coordinates = self.landmarksDetection(image, results, False)
            eyes_ratio = self.blinkRatio(mesh_coordinates, self.RIGHT_EYE, self.LEFT_EYE)
            if eyes_ratio > self.blink_lower_thresh and eyes_ratio <= self.blink_upper_thresh:
                # Ambiguous ratio: confirm the blink with the ViT classifier
                blinked = self.blink_detection_model(left_eye=left_eye, right_eye=right_eye)
            elif eyes_ratio <= self.blink_lower_thresh:
                # Ratio is clearly below the lower threshold: count it as a blink
                blinked = True
            else:
                blinked = False

        return {"left_eye": left_eye, "right_eye": right_eye, "blinked": blinked, "eyes_ratio": eyes_ratio}

    def segment_iris(self, iris_img):
        # Convert RGB image to grayscale
        iris_img_gray = cv2.cvtColor(iris_img, cv2.COLOR_RGB2GRAY)
        # Apply Gaussian blur for denoising
        iris_img_blur = cv2.GaussianBlur(iris_img_gray, (5, 5), 0)
        # Perform Otsu thresholding (cv2.THRESH_OTSU picks the threshold automatically)
        _, iris_img_mask = cv2.threshold(iris_img_blur, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        # Invert the mask so the darker iris region becomes foreground
        segmented_mask = cv2.bitwise_not(iris_img_mask)
        segmented_mask = cv2.cvtColor(segmented_mask, cv2.COLOR_GRAY2RGB)
        segmented_iris = cv2.bitwise_and(iris_img, segmented_mask)
        return {
            "segmented_iris": segmented_iris,
            "segmented_mask": segmented_mask,
        }

    def extract_iris(self, image):
        ih, iw, _ = image.shape
        tmp_face = image.copy()
        results = self.face_mesh.process(tmp_face)
        if results.multi_face_landmarks is None:
            return None
        mesh_coordinates = self.landmarksDetection(image, results, False)
        mesh_points = np.array(mesh_coordinates)
        (l_cx, l_cy), l_radius = cv2.minEnclosingCircle(mesh_points[self.LEFT_IRIS])
        (r_cx, r_cy), r_radius = cv2.minEnclosingCircle(mesh_points[self.RIGHT_IRIS])

        # Crop the left iris to be exactly (16 * upscale) x (16 * upscale)
        l_x1 = max(int(l_cx) - (8 * self.upscale), 0)
        l_y1 = max(int(l_cy) - (8 * self.upscale), 0)
        l_x2 = min(int(l_cx) + (8 * self.upscale), iw)
        l_y2 = min(int(l_cy) + (8 * self.upscale), ih)
        cropped_left_iris = image[l_y1:l_y2, l_x1:l_x2]
        left_iris_segmented_data = self.segment_iris(cv2.cvtColor(cropped_left_iris, cv2.COLOR_BGR2RGB))

        # Crop the right iris to be exactly (16 * upscale) x (16 * upscale)
        r_x1 = max(int(r_cx) - (8 * self.upscale), 0)
        r_y1 = max(int(r_cy) - (8 * self.upscale), 0)
        r_x2 = min(int(r_cx) + (8 * self.upscale), iw)
        r_y2 = min(int(r_cy) + (8 * self.upscale), ih)
        cropped_right_iris = image[r_y1:r_y2, r_x1:r_x2]
        right_iris_segmented_data = self.segment_iris(cv2.cvtColor(cropped_right_iris, cv2.COLOR_BGR2RGB))

        return {
            "left_iris": {
                "img": cropped_left_iris,
                "segmented_iris": left_iris_segmented_data["segmented_iris"],
                "segmented_mask": left_iris_segmented_data["segmented_mask"],
            },
            "right_iris": {
                "img": cropped_right_iris,
                "segmented_iris": right_iris_segmented_data["segmented_iris"],
                "segmented_mask": right_iris_segmented_data["segmented_mask"],
            },
        }
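

# Example usage: a minimal, illustrative sketch rather than part of the original module.
# It assumes an image read from disk; "sample.jpg" and upscale=1 are placeholder choices.
if __name__ == "__main__":
    extractor = ExtractorMediaPipe(upscale=1)
    bgr = cv2.imread("sample.jpg")  # hypothetical input path
    if bgr is None:
        raise FileNotFoundError("sample.jpg not found")
    rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)  # MediaPipe and the ViT pipeline expect RGB

    face = extractor.extract_face(rgb)
    if face is not None:
        eyes = extractor.extract_eyes(face, blink_detection=True)
        iris = extractor.extract_iris(face)
        if eyes is not None:
            print("blinked:", eyes["blinked"], "eyes_ratio:", eyes["eyes_ratio"])
        if iris is not None:
            print("left iris crop shape:", iris["left_iris"]["img"].shape)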