import cv2 import numpy as np import torch import torch.nn as nn import torch.nn.functional as F from PIL import Image from torchvision.models._utils import IntermediateLayerGetter as IntermediateLayerGetter from fooocus_extras.facexlib.detection.align_trans import get_reference_facial_points, warp_and_crop_face from fooocus_extras.facexlib.detection.retinaface_net import FPN, SSH, MobileNetV1, make_bbox_head, make_class_head, make_landmark_head from fooocus_extras.facexlib.detection.retinaface_utils import (PriorBox, batched_decode, batched_decode_landm, decode, decode_landm, py_cpu_nms) def generate_config(network_name): cfg_mnet = { 'name': 'mobilenet0.25', 'min_sizes': [[16, 32], [64, 128], [256, 512]], 'steps': [8, 16, 32], 'variance': [0.1, 0.2], 'clip': False, 'loc_weight': 2.0, 'gpu_train': True, 'batch_size': 32, 'ngpu': 1, 'epoch': 250, 'decay1': 190, 'decay2': 220, 'image_size': 640, 'return_layers': { 'stage1': 1, 'stage2': 2, 'stage3': 3 }, 'in_channel': 32, 'out_channel': 64 } cfg_re50 = { 'name': 'Resnet50', 'min_sizes': [[16, 32], [64, 128], [256, 512]], 'steps': [8, 16, 32], 'variance': [0.1, 0.2], 'clip': False, 'loc_weight': 2.0, 'gpu_train': True, 'batch_size': 24, 'ngpu': 4, 'epoch': 100, 'decay1': 70, 'decay2': 90, 'image_size': 840, 'return_layers': { 'layer2': 1, 'layer3': 2, 'layer4': 3 }, 'in_channel': 256, 'out_channel': 256 } if network_name == 'mobile0.25': return cfg_mnet elif network_name == 'resnet50': return cfg_re50 else: raise NotImplementedError(f'network_name={network_name}') class RetinaFace(nn.Module): def __init__(self, network_name='resnet50', half=False, phase='test', device=None): self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') if device is None else device super(RetinaFace, self).__init__() self.half_inference = half cfg = generate_config(network_name) self.backbone = cfg['name'] self.model_name = f'retinaface_{network_name}' self.cfg = cfg self.phase = phase self.target_size, self.max_size = 1600, 2150 self.resize, self.scale, self.scale1 = 1., None, None self.mean_tensor = torch.tensor([[[[104.]], [[117.]], [[123.]]]], device=self.device) self.reference = get_reference_facial_points(default_square=True) # Build network. backbone = None if cfg['name'] == 'mobilenet0.25': backbone = MobileNetV1() self.body = IntermediateLayerGetter(backbone, cfg['return_layers']) elif cfg['name'] == 'Resnet50': import torchvision.models as models backbone = models.resnet50(weights=None) self.body = IntermediateLayerGetter(backbone, cfg['return_layers']) in_channels_stage2 = cfg['in_channel'] in_channels_list = [ in_channels_stage2 * 2, in_channels_stage2 * 4, in_channels_stage2 * 8, ] out_channels = cfg['out_channel'] self.fpn = FPN(in_channels_list, out_channels) self.ssh1 = SSH(out_channels, out_channels) self.ssh2 = SSH(out_channels, out_channels) self.ssh3 = SSH(out_channels, out_channels) self.ClassHead = make_class_head(fpn_num=3, inchannels=cfg['out_channel']) self.BboxHead = make_bbox_head(fpn_num=3, inchannels=cfg['out_channel']) self.LandmarkHead = make_landmark_head(fpn_num=3, inchannels=cfg['out_channel']) self.to(self.device) self.eval() if self.half_inference: self.half() def forward(self, inputs): out = self.body(inputs) if self.backbone == 'mobilenet0.25' or self.backbone == 'Resnet50': out = list(out.values()) # FPN fpn = self.fpn(out) # SSH feature1 = self.ssh1(fpn[0]) feature2 = self.ssh2(fpn[1]) feature3 = self.ssh3(fpn[2]) features = [feature1, feature2, feature3] bbox_regressions = torch.cat([self.BboxHead[i](feature) for i, feature in enumerate(features)], dim=1) classifications = torch.cat([self.ClassHead[i](feature) for i, feature in enumerate(features)], dim=1) tmp = [self.LandmarkHead[i](feature) for i, feature in enumerate(features)] ldm_regressions = (torch.cat(tmp, dim=1)) if self.phase == 'train': output = (bbox_regressions, classifications, ldm_regressions) else: output = (bbox_regressions, F.softmax(classifications, dim=-1), ldm_regressions) return output def __detect_faces(self, inputs): # get scale height, width = inputs.shape[2:] self.scale = torch.tensor([width, height, width, height], dtype=torch.float32, device=self.device) tmp = [width, height, width, height, width, height, width, height, width, height] self.scale1 = torch.tensor(tmp, dtype=torch.float32, device=self.device) # forawrd inputs = inputs.to(self.device) if self.half_inference: inputs = inputs.half() loc, conf, landmarks = self(inputs) # get priorbox priorbox = PriorBox(self.cfg, image_size=inputs.shape[2:]) priors = priorbox.forward().to(self.device) return loc, conf, landmarks, priors # single image detection def transform(self, image, use_origin_size): # convert to opencv format if isinstance(image, Image.Image): image = cv2.cvtColor(np.asarray(image), cv2.COLOR_RGB2BGR) image = image.astype(np.float32) # testing scale im_size_min = np.min(image.shape[0:2]) im_size_max = np.max(image.shape[0:2]) resize = float(self.target_size) / float(im_size_min) # prevent bigger axis from being more than max_size if np.round(resize * im_size_max) > self.max_size: resize = float(self.max_size) / float(im_size_max) resize = 1 if use_origin_size else resize # resize if resize != 1: image = cv2.resize(image, None, None, fx=resize, fy=resize, interpolation=cv2.INTER_LINEAR) # convert to torch.tensor format # image -= (104, 117, 123) image = image.transpose(2, 0, 1) image = torch.from_numpy(image).unsqueeze(0) return image, resize def detect_faces( self, image, conf_threshold=0.8, nms_threshold=0.4, use_origin_size=True, ): image, self.resize = self.transform(image, use_origin_size) image = image.to(self.device) if self.half_inference: image = image.half() image = image - self.mean_tensor loc, conf, landmarks, priors = self.__detect_faces(image) boxes = decode(loc.data.squeeze(0), priors.data, self.cfg['variance']) boxes = boxes * self.scale / self.resize boxes = boxes.cpu().numpy() scores = conf.squeeze(0).data.cpu().numpy()[:, 1] landmarks = decode_landm(landmarks.squeeze(0), priors, self.cfg['variance']) landmarks = landmarks * self.scale1 / self.resize landmarks = landmarks.cpu().numpy() # ignore low scores inds = np.where(scores > conf_threshold)[0] boxes, landmarks, scores = boxes[inds], landmarks[inds], scores[inds] # sort order = scores.argsort()[::-1] boxes, landmarks, scores = boxes[order], landmarks[order], scores[order] # do NMS bounding_boxes = np.hstack((boxes, scores[:, np.newaxis])).astype(np.float32, copy=False) keep = py_cpu_nms(bounding_boxes, nms_threshold) bounding_boxes, landmarks = bounding_boxes[keep, :], landmarks[keep] # self.t['forward_pass'].toc() # print(self.t['forward_pass'].average_time) # import sys # sys.stdout.flush() return np.concatenate((bounding_boxes, landmarks), axis=1) def __align_multi(self, image, boxes, landmarks, limit=None): if len(boxes) < 1: return [], [] if limit: boxes = boxes[:limit] landmarks = landmarks[:limit] faces = [] for landmark in landmarks: facial5points = [[landmark[2 * j], landmark[2 * j + 1]] for j in range(5)] warped_face = warp_and_crop_face(np.array(image), facial5points, self.reference, crop_size=(112, 112)) faces.append(warped_face) return np.concatenate((boxes, landmarks), axis=1), faces def align_multi(self, img, conf_threshold=0.8, limit=None): rlt = self.detect_faces(img, conf_threshold=conf_threshold) boxes, landmarks = rlt[:, 0:5], rlt[:, 5:] return self.__align_multi(img, boxes, landmarks, limit) # batched detection def batched_transform(self, frames, use_origin_size): """ Arguments: frames: a list of PIL.Image, or torch.Tensor(shape=[n, h, w, c], type=np.float32, BGR format). use_origin_size: whether to use origin size. """ from_PIL = True if isinstance(frames[0], Image.Image) else False # convert to opencv format if from_PIL: frames = [cv2.cvtColor(np.asarray(frame), cv2.COLOR_RGB2BGR) for frame in frames] frames = np.asarray(frames, dtype=np.float32) # testing scale im_size_min = np.min(frames[0].shape[0:2]) im_size_max = np.max(frames[0].shape[0:2]) resize = float(self.target_size) / float(im_size_min) # prevent bigger axis from being more than max_size if np.round(resize * im_size_max) > self.max_size: resize = float(self.max_size) / float(im_size_max) resize = 1 if use_origin_size else resize # resize if resize != 1: if not from_PIL: frames = F.interpolate(frames, scale_factor=resize) else: frames = [ cv2.resize(frame, None, None, fx=resize, fy=resize, interpolation=cv2.INTER_LINEAR) for frame in frames ] # convert to torch.tensor format if not from_PIL: frames = frames.transpose(1, 2).transpose(1, 3).contiguous() else: frames = frames.transpose((0, 3, 1, 2)) frames = torch.from_numpy(frames) return frames, resize def batched_detect_faces(self, frames, conf_threshold=0.8, nms_threshold=0.4, use_origin_size=True): """ Arguments: frames: a list of PIL.Image, or np.array(shape=[n, h, w, c], type=np.uint8, BGR format). conf_threshold: confidence threshold. nms_threshold: nms threshold. use_origin_size: whether to use origin size. Returns: final_bounding_boxes: list of np.array ([n_boxes, 5], type=np.float32). final_landmarks: list of np.array ([n_boxes, 10], type=np.float32). """ # self.t['forward_pass'].tic() frames, self.resize = self.batched_transform(frames, use_origin_size) frames = frames.to(self.device) frames = frames - self.mean_tensor b_loc, b_conf, b_landmarks, priors = self.__detect_faces(frames) final_bounding_boxes, final_landmarks = [], [] # decode priors = priors.unsqueeze(0) b_loc = batched_decode(b_loc, priors, self.cfg['variance']) * self.scale / self.resize b_landmarks = batched_decode_landm(b_landmarks, priors, self.cfg['variance']) * self.scale1 / self.resize b_conf = b_conf[:, :, 1] # index for selection b_indice = b_conf > conf_threshold # concat b_loc_and_conf = torch.cat((b_loc, b_conf.unsqueeze(-1)), dim=2).float() for pred, landm, inds in zip(b_loc_and_conf, b_landmarks, b_indice): # ignore low scores pred, landm = pred[inds, :], landm[inds, :] if pred.shape[0] == 0: final_bounding_boxes.append(np.array([], dtype=np.float32)) final_landmarks.append(np.array([], dtype=np.float32)) continue # sort # order = score.argsort(descending=True) # box, landm, score = box[order], landm[order], score[order] # to CPU bounding_boxes, landm = pred.cpu().numpy(), landm.cpu().numpy() # NMS keep = py_cpu_nms(bounding_boxes, nms_threshold) bounding_boxes, landmarks = bounding_boxes[keep, :], landm[keep] # append final_bounding_boxes.append(bounding_boxes) final_landmarks.append(landmarks) # self.t['forward_pass'].toc(average=True) # self.batch_time += self.t['forward_pass'].diff # self.total_frame += len(frames) # print(self.batch_time / self.total_frame) return final_bounding_boxes, final_landmarks