Upload 18 files
Browse files

- roop/ProcessMgr.py +223 -27
- roop/ProcessOptions.py +5 -2
- roop/StreamWriter.py +60 -0
- roop/capturer.py +21 -5
- roop/core.py +40 -14
- roop/face_util.py +11 -8
- roop/globals.py +3 -0
- roop/metadata.py +1 -1
- roop/util_ffmpeg.py +22 -2
- roop/utilities.py +39 -0
- roop/virtualcam.py +8 -7
roop/ProcessMgr.py  CHANGED

@@ -3,10 +3,9 @@ import cv2
 import numpy as np
 import psutil
 
-from enum import Enum
 from roop.ProcessOptions import ProcessOptions
 
-from roop.face_util import get_first_face, get_all_faces,
+from roop.face_util import get_first_face, get_all_faces, rotate_anticlockwise, rotate_clockwise, clamp_cut_values
 from roop.utilities import compute_cosine_distance, get_device, str_to_class
 import roop.vr_util as vr
 
@@ -17,15 +16,18 @@ from threading import Thread, Lock
 from queue import Queue
 from tqdm import tqdm
 from roop.ffmpeg_writer import FFMPEG_VideoWriter
+from roop.StreamWriter import StreamWriter
 import roop.globals
 
 
+
 # Poor man's enum to be able to compare to int
 class eNoFaceAction():
     USE_ORIGINAL_FRAME = 0
     RETRY_ROTATED = 1
     SKIP_FRAME = 2
-    SKIP_FRAME_IF_DISSIMILAR = 3
+    SKIP_FRAME_IF_DISSIMILAR = 3,
+    USE_LAST_SWAPPED = 4
 
 
 
@@ -44,6 +46,7 @@ def pick_queue(queue: Queue[str], queue_per_future: int) -> List[str]:
     return queues
 
 
+
 class ProcessMgr():
     input_face_datas = []
     target_face_datas = []
@@ -64,11 +67,16 @@ class ProcessMgr():
     processed_queue = None
 
     videowriter= None
+    streamwriter = None
 
     progress_gradio = None
    total_frames = 0
 
-
+    num_frames_no_face = 0
+    last_swapped_frame = None
+
+    output_to_file = None
+    output_to_cam = None
 
 
     plugins = {
@@ -101,6 +109,8 @@ class ProcessMgr():
     def initialize(self, input_faces, target_faces, options):
         self.input_face_datas = input_faces
         self.target_face_datas = target_faces
+        self.num_frames_no_face = 0
+        self.last_swapped_frame = None
         self.options = options
         devicename = get_device()
 
@@ -185,7 +195,8 @@
             resimg = self.process_frame(temp_frame)
             if resimg is not None:
                 i = source_files.index(f)
-
+                # Also let numpy write the file to support utf-8/16 filenames
+                cv2.imencode(f'.{roop.globals.CFG.output_image_format}',resimg)[1].tofile(target_files[i])
             if update:
                 update()
 
@@ -239,7 +250,10 @@
             process, frame = self.processed_queue[nextindex % self.num_threads].get()
             nextindex += 1
             if frame is not None:
-                self.
+                if self.output_to_file:
+                    self.videowriter.write_frame(frame)
+                if self.output_to_cam:
+                    self.streamwriter.WriteToStream(frame)
                 del frame
             elif process == False:
                 num_producers -= 1
@@ -248,7 +262,11 @@
 
 
 
-    def run_batch_inmem(self, source_video, target_video, frame_start, frame_end, fps, threads:int = 1
+    def run_batch_inmem(self, output_method, source_video, target_video, frame_start, frame_end, fps, threads:int = 1):
+        if len(self.processors) < 1:
+            print("No processor defined!")
+            return
+
         cap = cv2.VideoCapture(source_video)
         # frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
         frame_count = (frame_end - frame_start) + 1
@@ -275,7 +293,13 @@
             self.frames_queue.append(Queue(1))
             self.processed_queue.append(Queue(1))
 
-        self.
+        self.output_to_file = output_method != "Virtual Camera"
+        self.output_to_cam = output_method == "Virtual Camera" or output_method == "Both"
+
+        if self.output_to_file:
+            self.videowriter = FFMPEG_VideoWriter(target_video, (width, height), fps, codec=roop.globals.video_encoder, crf=roop.globals.video_quality, audiofile=None)
+        if self.output_to_cam:
+            self.streamwriter = StreamWriter((width, height), int(fps))
 
         readthread = Thread(target=self.read_frames_thread, args=(cap, frame_start, frame_end, threads))
         readthread.start()
@@ -298,7 +322,11 @@
         readthread.join()
         writethread.join()
         cap.release()
-        self.
+        if self.output_to_file:
+            self.videowriter.close()
+        if self.output_to_cam:
+            self.streamwriter.Close()
+
         self.frames_queue.clear()
         self.processed_queue.clear()
 
@@ -317,11 +345,6 @@
             self.progress_gradio((progress.n, self.total_frames), desc='Processing', total=self.total_frames, unit='frames')
 
 
-    # https://github.com/deepinsight/insightface#third-party-re-implementation-of-arcface
-    # https://github.com/deepinsight/insightface/blob/master/alignment/coordinate_reg/image_infer.py
-    # https://github.com/deepinsight/insightface/issues/1350
-    # https://github.com/linghu8812/tensorrt_inference
-
 
     def process_frame(self, frame:Frame):
         if len(self.input_face_datas) < 1 and not self.options.show_face_masking:
@@ -332,8 +355,16 @@
         if roop.globals.no_face_action == eNoFaceAction.SKIP_FRAME_IF_DISSIMILAR:
             if len(self.input_face_datas) > num_swapped:
                 return None
+            self.num_frames_no_face = 0
+            self.last_swapped_frame = temp_frame.copy()
             return temp_frame
-        if roop.globals.no_face_action == eNoFaceAction.
+        if roop.globals.no_face_action == eNoFaceAction.USE_LAST_SWAPPED:
+            if self.last_swapped_frame is not None and self.num_frames_no_face < self.options.max_num_reuse_frame:
+                self.num_frames_no_face += 1
+                return self.last_swapped_frame.copy()
+            return frame
+
+        elif roop.globals.no_face_action == eNoFaceAction.USE_ORIGINAL_FRAME:
             return frame
         if roop.globals.no_face_action == eNoFaceAction.SKIP_FRAME:
             #This only works with in-mem processing, as it simply skips the frame.
@@ -374,6 +405,8 @@
 
                 num_faces_found += 1
                 temp_frame = self.process_face(self.options.selected_index, face, temp_frame)
+                del face
+
         else:
             faces = get_all_faces(frame)
             if faces is None:
@@ -383,7 +416,14 @@
                 for face in faces:
                     num_faces_found += 1
                     temp_frame = self.process_face(self.options.selected_index, face, temp_frame)
-
+
+            elif self.options.swap_mode == "all_input":
+                for i,face in enumerate(faces):
+                    num_faces_found += 1
+                    if i < len(self.input_face_datas):
+                        temp_frame = self.process_face(i, face, temp_frame)
+                    else:
+                        break
 
             elif self.options.swap_mode == "selected":
                 num_targetfaces = len(self.target_face_datas)
@@ -397,7 +437,6 @@
                     else:
                         temp_frame = self.process_face(i, face, temp_frame)
                         num_faces_found += 1
-                    del face
                     if not roop.globals.vr_mode and num_faces_found == num_targetfaces:
                         break
             elif self.options.swap_mode == "all_female" or self.options.swap_mode == "all_male":
@@ -406,7 +445,13 @@
                     if face.sex == gender:
                         num_faces_found += 1
                         temp_frame = self.process_face(self.options.selected_index, face, temp_frame)
-
+
+            # might be slower but way more clean to release everything here
+            for face in faces:
+                del face
+            faces.clear()
+
+
 
         if roop.globals.vr_mode and num_faces_found % 2 > 0:
             # stereo image, there has to be an even number of faces
@@ -541,17 +586,31 @@
 
         # img = vr.GetPerspective(frame, 90, theta, phi, 1280, 1280)  # Generate perspective image
 
-
-
+
+        """ Code ported/adapted from Facefusion which borrowed the idea from Rope:
+            Kind of subsampling the cutout and aligned face image and faceswapping slices of it up to
+            the desired output resolution. This works around the current resolution limitations without using enhancers.
+        """
+        model_output_size = 128
+        subsample_size = self.options.subsample_size
+        subsample_total = subsample_size // model_output_size
+        aligned_img, M = align_crop(frame, target_face.kps, subsample_size)
+
         fake_frame = aligned_img
-        swap_frame = aligned_img
         target_face.matrix = M
+
         for p in self.processors:
             if p.type == 'swap':
-
+                swap_result_frames = []
+                subsample_frames = self.implode_pixel_boost(aligned_img, model_output_size, subsample_total)
+                for sliced_frame in subsample_frames:
                     for _ in range(0,self.options.num_swap_steps):
-
-
+                        sliced_frame = self.prepare_crop_frame(sliced_frame)
+                        sliced_frame = p.Run(inputface, target_face, sliced_frame)
+                        sliced_frame = self.normalize_swap_frame(sliced_frame)
+                    swap_result_frames.append(sliced_frame)
+                fake_frame = self.explode_pixel_boost(swap_result_frames, model_output_size, subsample_total, subsample_size)
+                fake_frame = fake_frame.astype(np.uint8)
                 scale_factor = 0.0
             elif p.type == 'mask':
                 fake_frame = self.process_mask(p, aligned_img, fake_frame)
@@ -560,8 +619,8 @@
 
         upscale = 512
         orig_width = fake_frame.shape[1]
-
-
+        if orig_width != upscale:
+            fake_frame = cv2.resize(fake_frame, (upscale, upscale), cv2.INTER_CUBIC)
         mask_offsets = (0,0,0,0,1,20) if inputface is None else inputface.mask_offsets
 
 
@@ -571,9 +630,14 @@
         else:
             result = self.paste_upscale(fake_frame, enhanced_frame, target_face.matrix, frame, scale_factor, mask_offsets)
 
+        # Restore mouth before unrotating
+        if self.options.restore_original_mouth:
+            mouth_cutout, mouth_bb = self.create_mouth_mask(target_face, frame)
+            result = self.apply_mouth_area(result, mouth_cutout, mouth_bb)
+
         if rotation_action is not None:
             fake_frame = self.auto_unrotate_frame(result, rotation_action)
-
+            result = self.paste_simple(fake_frame, saved_frame, startX, startY)
 
         return result
 
@@ -673,6 +737,43 @@
         return cv2.GaussianBlur(img_matte, blur_size, 0)
 
 
+    def prepare_crop_frame(self, swap_frame):
+        model_type = 'inswapper'
+        model_mean = [0.0, 0.0, 0.0]
+        model_standard_deviation = [1.0, 1.0, 1.0]
+
+        if model_type == 'ghost':
+            swap_frame = swap_frame[:, :, ::-1] / 127.5 - 1
+        else:
+            swap_frame = swap_frame[:, :, ::-1] / 255.0
+        swap_frame = (swap_frame - model_mean) / model_standard_deviation
+        swap_frame = swap_frame.transpose(2, 0, 1)
+        swap_frame = np.expand_dims(swap_frame, axis = 0).astype(np.float32)
+        return swap_frame
+
+
+    def normalize_swap_frame(self, swap_frame):
+        model_type = 'inswapper'
+        swap_frame = swap_frame.transpose(1, 2, 0)
+
+        if model_type == 'ghost':
+            swap_frame = (swap_frame * 127.5 + 127.5).round()
+        else:
+            swap_frame = (swap_frame * 255.0).round()
+        swap_frame = swap_frame[:, :, ::-1]
+        return swap_frame
+
+    def implode_pixel_boost(self, aligned_face_frame, model_size, pixel_boost_total : int):
+        subsample_frame = aligned_face_frame.reshape(model_size, pixel_boost_total, model_size, pixel_boost_total, 3)
+        subsample_frame = subsample_frame.transpose(1, 3, 0, 2, 4).reshape(pixel_boost_total ** 2, model_size, model_size, 3)
+        return subsample_frame
+
+
+    def explode_pixel_boost(self, subsample_frame, model_size, pixel_boost_total, pixel_boost_size):
+        final_frame = np.stack(subsample_frame, axis = 0).reshape(pixel_boost_total, pixel_boost_total, model_size, model_size, 3)
+        final_frame = final_frame.transpose(2, 0, 3, 1, 4).reshape(pixel_boost_size, pixel_boost_size, 3)
+        return final_frame
+
     def process_mask(self, processor, frame:Frame, target:Frame):
         img_mask = processor.Run(frame, self.options.masking_text)
         img_mask = cv2.resize(img_mask, (target.shape[1], target.shape[0]))
@@ -688,7 +789,98 @@
         result += img_mask * frame.astype(np.float32)
         return np.uint8(result)
 
+
+    # Code for mouth restoration adapted from https://github.com/iVideoGameBoss/iRoopDeepFaceCam
+
+    def create_mouth_mask(self, face: Face, frame: Frame):
+        mouth_cutout = None
+
+        landmarks = face.landmark_2d_106
+        if landmarks is not None:
+            # Get mouth landmarks (indices 52 to 71 typically represent the outer mouth)
+            mouth_points = landmarks[52:71].astype(np.int32)
+
+            # Add padding to mouth area
+            min_x, min_y = np.min(mouth_points, axis=0)
+            max_x, max_y = np.max(mouth_points, axis=0)
+            min_x = max(0, min_x - (15*6))
+            min_y = max(0, min_y - 22)
+            max_x = min(frame.shape[1], max_x + (15*6))
+            max_y = min(frame.shape[0], max_y + (90*6))
+
+            # Extract the mouth area from the frame using the calculated bounding box
+            mouth_cutout = frame[min_y:max_y, min_x:max_x].copy()
+
+        return mouth_cutout, (min_x, min_y, max_x, max_y)
+
+
+
+    def create_feathered_mask(self, shape, feather_amount=30):
+        mask = np.zeros(shape[:2], dtype=np.float32)
+        center = (shape[1] // 2, shape[0] // 2)
+        cv2.ellipse(mask, center, (shape[1] // 2 - feather_amount, shape[0] // 2 - feather_amount),
+                    0, 0, 360, 1, -1)
+        mask = cv2.GaussianBlur(mask, (feather_amount*2+1, feather_amount*2+1), 0)
+        return mask / np.max(mask)
+
+    def apply_mouth_area(self, frame: np.ndarray, mouth_cutout: np.ndarray, mouth_box: tuple) -> np.ndarray:
+        min_x, min_y, max_x, max_y = mouth_box
+        box_width = max_x - min_x
+        box_height = max_y - min_y
+
+
+        # Resize the mouth cutout to match the mouth box size
+        if mouth_cutout is None or box_width is None or box_height is None:
+            return frame
+        try:
+            resized_mouth_cutout = cv2.resize(mouth_cutout, (box_width, box_height))
+
+            # Extract the region of interest (ROI) from the target frame
+            roi = frame[min_y:max_y, min_x:max_x]
+
+            # Ensure the ROI and resized_mouth_cutout have the same shape
+            if roi.shape != resized_mouth_cutout.shape:
+                resized_mouth_cutout = cv2.resize(resized_mouth_cutout, (roi.shape[1], roi.shape[0]))
+
+            # Apply color transfer from ROI to mouth cutout
+            color_corrected_mouth = self.apply_color_transfer(resized_mouth_cutout, roi)
+
+            # Create a feathered mask with increased feather amount
+            feather_amount = min(30, box_width // 15, box_height // 15)
+            mask = self.create_feathered_mask(resized_mouth_cutout.shape, feather_amount)
+
+            # Blend the color-corrected mouth cutout with the ROI using the feathered mask
+            mask = mask[:,:,np.newaxis]  # Add channel dimension to mask
+            blended = (color_corrected_mouth * mask + roi * (1 - mask)).astype(np.uint8)
+
+            # Place the blended result back into the frame
+            frame[min_y:max_y, min_x:max_x] = blended
+        except Exception as e:
+            print(f'Error {e}')
+            pass
+
+        return frame
+
+    def apply_color_transfer(self, source, target):
+        """
+        Apply color transfer from target to source image
+        """
+        source = cv2.cvtColor(source, cv2.COLOR_BGR2LAB).astype("float32")
+        target = cv2.cvtColor(target, cv2.COLOR_BGR2LAB).astype("float32")
+
+        source_mean, source_std = cv2.meanStdDev(source)
+        target_mean, target_std = cv2.meanStdDev(target)
+
+        # Reshape mean and std to be broadcastable
+        source_mean = source_mean.reshape(1, 1, 3)
+        source_std = source_std.reshape(1, 1, 3)
+        target_mean = target_mean.reshape(1, 1, 3)
+        target_std = target_std.reshape(1, 1, 3)
+
+        # Perform the color transfer
+        source = (source - source_mean) * (target_std / source_std) + target_mean
+        return cv2.cvtColor(np.clip(source, 0, 255).astype("uint8"), cv2.COLOR_LAB2BGR)
+
 
 
     def unload_models():
@@ -699,4 +891,8 @@
         for p in self.processors:
             p.Release()
         self.processors.clear()
+        if self.videowriter is not None:
+            self.videowriter.close()
+        if self.streamwriter is not None:
+            self.streamwriter.Close()
 
roop/ProcessOptions.py  CHANGED

@@ -1,6 +1,6 @@
 class ProcessOptions:
 
-    def __init__(self, processordefines:dict, face_distance, blend_ratio, swap_mode, selected_index, masking_text, imagemask, num_steps, show_face_area, show_mask=False):
+    def __init__(self, processordefines:dict, face_distance, blend_ratio, swap_mode, selected_index, masking_text, imagemask, num_steps, subsample_size, show_face_area, restore_original_mouth, show_mask=False):
         self.processors = processordefines
         self.face_distance_threshold = face_distance
         self.blend_ratio = blend_ratio
@@ -10,4 +10,7 @@ class ProcessOptions:
         self.imagemask = imagemask
         self.num_swap_steps = num_steps
         self.show_face_area_overlay = show_face_area
-        self.show_face_masking = show_mask
+        self.show_face_masking = show_mask
+        self.subsample_size = subsample_size
+        self.restore_original_mouth = restore_original_mouth
+        self.max_num_reuse_frame = 15
roop/StreamWriter.py  ADDED

@@ -0,0 +1,60 @@
+import threading
+import time
+import pyvirtualcam
+
+
+class StreamWriter():
+    FPS = 30
+    VCam = None
+    Active = False
+    THREAD_LOCK_STREAM = threading.Lock()
+    time_last_process = None
+    timespan_min = 0.0
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.Close()
+
+    def __init__(self, size, fps):
+        self.time_last_process = time.perf_counter()
+        self.FPS = fps
+        self.timespan_min = 1.0 / fps
+        print('Detecting virtual cam devices')
+        self.VCam = pyvirtualcam.Camera(width=size[0], height=size[1], fps=fps, fmt=pyvirtualcam.PixelFormat.BGR, print_fps=False)
+        if self.VCam is None:
+            print("No virtual camera found!")
+            return
+        print(f'Using virtual camera: {self.VCam.device}')
+        print(f'Using {self.VCam.native_fmt}')
+        self.Active = True
+
+
+    def LimitFrames(self):
+        while True:
+            current_time = time.perf_counter()
+            time_passed = current_time - self.time_last_process
+            if time_passed >= self.timespan_min:
+                break
+
+    # First version used a queue and threading. Surprisingly this
+    # totally simple, blocking version is 10 times faster!
+    def WriteToStream(self, frame):
+        if self.VCam is None:
+            return
+        with self.THREAD_LOCK_STREAM:
+            self.LimitFrames()
+            self.VCam.send(frame)
+            self.time_last_process = time.perf_counter()
+
+
+    def Close(self):
+        self.Active = False
+        if self.VCam is None:
+            self.VCam.close()
+            self.VCam = None
+
+
+
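For reference, the new writer is meant to be fed one BGR frame at a time, with the context-manager hooks handling cleanup. The sketch below is illustrative only: the 1280x720 size, 30 fps and the OpenCV webcam loop are assumptions, not part of the commit, and a virtual camera backend that pyvirtualcam can find must already be installed.

import cv2
from roop.StreamWriter import StreamWriter

cap = cv2.VideoCapture(0)                       # any BGR frame source works
with StreamWriter((1280, 720), 30) as writer:   # size must match the frames you send
    for _ in range(300):                        # roughly ten seconds at 30 fps
        ok, frame = cap.read()
        if not ok or not writer.Active:
            break
        frame = cv2.resize(frame, (1280, 720))
        writer.WriteToStream(frame)             # blocks just long enough to hold the target FPS
cap.release()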
roop/capturer.py  CHANGED

@@ -4,6 +4,10 @@ import numpy as np
 
 from roop.typing import Frame
 
+current_video_path = None
+current_frame_total = 0
+current_capture = None
+
 def get_image_frame(filename: str):
     try:
         return cv2.imdecode(np.fromfile(filename, dtype=np.uint8), cv2.IMREAD_COLOR)
@@ -13,15 +17,27 @@ def get_image_frame(filename: str):
 
 
 def get_video_frame(video_path: str, frame_number: int = 0) -> Optional[Frame]:
-
-
-
-
-
+    global current_video_path, current_capture, current_frame_total
+
+    if video_path != current_video_path:
+        release_video()
+        current_capture = cv2.VideoCapture(video_path)
+        current_video_path = video_path
+        current_frame_total = current_capture.get(cv2.CAP_PROP_FRAME_COUNT)
+
+    current_capture.set(cv2.CAP_PROP_POS_FRAMES, min(current_frame_total, frame_number - 1))
+    has_frame, frame = current_capture.read()
    if has_frame:
        return frame
    return None
 
+def release_video():
+    global current_capture
+
+    if current_capture is not None:
+        current_capture.release()
+        current_capture = None
+
 
 def get_video_frame_total(video_path: str) -> int:
     capture = cv2.VideoCapture(video_path)
roop/core.py  CHANGED

@@ -14,6 +14,7 @@ import signal
 import torch
 import onnxruntime
 import pathlib
+import argparse
 
 from time import time
 
@@ -27,7 +28,7 @@ from roop.face_util import extract_face_images
 from roop.ProcessEntry import ProcessEntry
 from roop.ProcessMgr import ProcessMgr
 from roop.ProcessOptions import ProcessOptions
-from roop.capturer import get_video_frame_total
+from roop.capturer import get_video_frame_total, release_video
 
 
 clip_text = None
@@ -47,9 +48,12 @@ warnings.filterwarnings('ignore', category=UserWarning, module='torchvision')
 def parse_args() -> None:
     signal.signal(signal.SIGINT, lambda signal_number, frame: destroy())
     roop.globals.headless = False
+
+    program = argparse.ArgumentParser(formatter_class=lambda prog: argparse.HelpFormatter(prog, max_help_position=100))
+    program.add_argument('--server_share', help='Public server', dest='server_share', action='store_true', default=False)
+    program.add_argument('--cuda_device_id', help='Index of the cuda gpu to use', dest='cuda_device_id', type=int, default=0)
+    roop.globals.startup_args = program.parse_args()
     # Always enable all processors when using GUI
-    if len(sys.argv) > 1:
-        print('No CLI args supported - use Settings Tab instead')
     roop.globals.frame_processors = ['face_swapper', 'face_enhancer']
 
 
@@ -58,8 +62,20 @@ def encode_execution_providers(execution_providers: List[str]) -> List[str]:
 
 
 def decode_execution_providers(execution_providers: List[str]) -> List[str]:
-
+    list_providers = [provider for provider, encoded_execution_provider in zip(onnxruntime.get_available_providers(), encode_execution_providers(onnxruntime.get_available_providers()))
                       if any(execution_provider in encoded_execution_provider for execution_provider in execution_providers)]
+
+    try:
+        for i in range(len(list_providers)):
+            if list_providers[i] == 'CUDAExecutionProvider':
+                list_providers[i] = ('CUDAExecutionProvider', {'device_id': roop.globals.cuda_device_id})
+                torch.cuda.set_device(roop.globals.cuda_device_id)
+                break
+    except:
+        pass
+
+    return list_providers
+
 
 
 def suggest_max_memory() -> int:
@@ -204,7 +220,7 @@ def live_swap(frame, options):
     return newframe
 
 
-def batch_process_regular(files:list[ProcessEntry], masking_engine:str, new_clip_text:str, use_new_method, imagemask, num_swap_steps, progress, selected_index = 0) -> None:
+def batch_process_regular(output_method, files:list[ProcessEntry], masking_engine:str, new_clip_text:str, use_new_method, imagemask, restore_original_mouth, num_swap_steps, progress, selected_index = 0) -> None:
     global clip_text, process_mgr
 
     release_resources()
@@ -214,9 +230,11 @@ def batch_process_regular(files:list[ProcessEntry], masking_engine:str, new_clip
     mask = imagemask["layers"][0] if imagemask is not None else None
     if len(roop.globals.INPUT_FACESETS) <= selected_index:
         selected_index = 0
-    options = ProcessOptions(get_processing_plugins(masking_engine), roop.globals.distance_threshold, roop.globals.blend_ratio,
+    options = ProcessOptions(get_processing_plugins(masking_engine), roop.globals.distance_threshold, roop.globals.blend_ratio,
+                             roop.globals.face_swap_mode, selected_index, new_clip_text, mask, num_swap_steps,
+                             roop.globals.subsample_size, False, restore_original_mouth)
     process_mgr.initialize(roop.globals.INPUT_FACESETS, roop.globals.TARGET_FACES, options)
-    batch_process(files, use_new_method)
+    batch_process(output_method, files, use_new_method)
     return
 
 def batch_process_with_options(files:list[ProcessEntry], options, progress):
@@ -230,11 +248,11 @@ def batch_process_with_options(files:list[ProcessEntry], options, progress):
     roop.globals.keep_frames = False
     roop.globals.wait_after_extraction = False
     roop.globals.skip_audio = False
-    batch_process(files, True)
+    batch_process("Files", files, True)
 
 
 
-def batch_process(files:list[ProcessEntry], use_new_method) -> None:
+def batch_process(output_method, files:list[ProcessEntry], use_new_method) -> None:
     global clip_text, process_mgr
 
     roop.globals.processing = True
@@ -287,9 +305,12 @@ def batch_process(files:list[ProcessEntry], use_new_method) -> None:
             if v.endframe == 0:
                 v.endframe = get_video_frame_total(v.filename)
 
-
+            is_streaming_only = output_method == "Virtual Camera"
+            if is_streaming_only == False:
+                update_status(f'Creating {os.path.basename(v.finalname)} with {fps} FPS...')
+
             start_processing = time()
-            if roop.globals.keep_frames or not use_new_method:
+            if is_streaming_only == False and roop.globals.keep_frames or not use_new_method:
                 util.create_temp(v.filename)
                 update_status('Extracting frames...')
                 ffmpeg.extract_frames(v.filename,v.startframe,v.endframe, fps)
@@ -317,7 +338,7 @@ def batch_process(files:list[ProcessEntry], use_new_method) -> None:
                     skip_audio = True
                 else:
                     skip_audio = roop.globals.skip_audio
-                process_mgr.run_batch_inmem(v.filename, v.finalname, v.startframe, v.endframe, fps,roop.globals.execution_threads
+                process_mgr.run_batch_inmem(output_method, v.filename, v.finalname, v.startframe, v.endframe, fps,roop.globals.execution_threads)
 
                 if not roop.globals.processing:
                     end_processing('Processing stopped!')
@@ -346,10 +367,12 @@ def batch_process(files:list[ProcessEntry], use_new_method) -> None:
                     os.remove(video_file_name)
                 else:
                     shutil.move(video_file_name, destination)
-                update_status(f'\nProcessing {os.path.basename(destination)} took {time() - start_processing} secs')
 
-
+            elif is_streaming_only == False:
                 update_status(f'Failed processing {os.path.basename(v.finalname)}!')
+            elapsed_time = time() - start_processing
+            average_fps = (v.endframe - v.startframe) / elapsed_time
+            update_status(f'\nProcessing {os.path.basename(destination)} took {elapsed_time:.2f} secs, {average_fps:.2f} frames/s')
     end_processing('Finished')
 
 
@@ -371,8 +394,11 @@ def run() -> None:
     if not pre_check():
         return
     roop.globals.CFG = Settings('config.yaml')
+    roop.globals.cuda_device_id = roop.globals.startup_args.cuda_device_id
     roop.globals.execution_threads = roop.globals.CFG.max_threads
     roop.globals.video_encoder = roop.globals.CFG.output_video_codec
     roop.globals.video_quality = roop.globals.CFG.video_quality
     roop.globals.max_memory = roop.globals.CFG.memory_limit if roop.globals.CFG.memory_limit > 0 else None
+    if roop.globals.startup_args.server_share:
+        roop.globals.CFG.server_share = True
     main.run()
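With these changes the launcher accepts real command-line flags instead of rejecting them: --cuda_device_id picks which GPU the CUDAExecutionProvider and torch bind to, and --server_share enables the public server option in the settings. Assuming the usual run.py entry point (not shown in this diff), a launch might look like `python run.py --cuda_device_id 1 --server_share`.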
roop/face_util.py  CHANGED

@@ -9,18 +9,18 @@ import cv2
 import numpy as np
 from skimage import transform as trans
 from roop.capturer import get_video_frame
-from roop.utilities import resolve_relative_path,
+from roop.utilities import resolve_relative_path, conditional_thread_semaphore
 
 FACE_ANALYSER = None
-THREAD_LOCK_ANALYSER = threading.Lock()
-THREAD_LOCK_SWAPPER = threading.Lock()
+#THREAD_LOCK_ANALYSER = threading.Lock()
+#THREAD_LOCK_SWAPPER = threading.Lock()
 FACE_SWAPPER = None
 
 
 def get_face_analyser() -> Any:
     global FACE_ANALYSER
 
-    with
+    with conditional_thread_semaphore():
         if FACE_ANALYSER is None or roop.globals.g_current_face_analysis != roop.globals.g_desired_face_analysis:
             model_path = resolve_relative_path('..')
             # removed genderage
@@ -210,15 +210,18 @@ arcface_dst = np.array(
 )
 
 
-def estimate_norm(lmk, image_size=112, mode="arcface"):
+def estimate_norm(lmk, image_size=112):
     assert lmk.shape == (5, 2)
-    assert image_size % 112 == 0 or image_size % 128 == 0
     if image_size % 112 == 0:
         ratio = float(image_size) / 112.0
         diff_x = 0
-
+    elif image_size % 128 == 0:
         ratio = float(image_size) / 128.0
         diff_x = 8.0 * ratio
+    elif image_size % 512 == 0:
+        ratio = float(image_size) / 512.0
+        diff_x = 32.0 * ratio
+
     dst = arcface_dst * ratio
     dst[:, 0] += diff_x
     tform = trans.SimilarityTransform()
@@ -230,7 +233,7 @@ def estimate_norm(lmk, image_size=112, mode="arcface"):
 
 # aligned, M = norm_crop2(f[1], face.kps, 512)
 def align_crop(img, landmark, image_size=112, mode="arcface"):
-    M = estimate_norm(landmark, image_size
+    M = estimate_norm(landmark, image_size)
     warped = cv2.warpAffine(img, M, (image_size, image_size), borderValue=0.0)
     return warped, M
 
roop/globals.py  CHANGED

@@ -5,7 +5,9 @@ source_path = None
 target_path = None
 output_path = None
 target_folder_path = None
+startup_args = None
 
+cuda_device_id = 0
 frame_processors: List[str] = []
 keep_fps = None
 keep_frames = None
@@ -26,6 +28,7 @@ execution_threads = None
 headless = None
 log_level = 'error'
 selected_enhancer = None
+subsample_size = 128
 face_swap_mode = None
 blend_ratio = 0.5
 distance_threshold = 0.65
roop/metadata.py  CHANGED

@@ -1,2 +1,2 @@
 name = 'roop unleashed'
-version = '4.
+version = '4.3.3'
roop/util_ffmpeg.py  CHANGED

@@ -73,12 +73,32 @@ def create_video(target_path: str, dest_filename: str, fps: float = 24.0, temp_d
 
 
 def create_gif_from_video(video_path: str, gif_path):
-    from roop.capturer import get_video_frame
+    from roop.capturer import get_video_frame, release_video
 
     fps = util.detect_fps(video_path)
     frame = get_video_frame(video_path)
+    release_video()
 
-
+    scalex = frame.shape[0]
+    scaley = frame.shape[1]
+
+    if scalex >= scaley:
+        scaley = -1
+    else:
+        scalex = -1
+
+    run_ffmpeg(['-i', video_path, '-vf', f'fps={fps},scale={int(scalex)}:{int(scaley)}:flags=lanczos,split[s0][s1];[s0]palettegen[p];[s1][p]paletteuse', '-loop', '0', gif_path])
+
+
+
+def create_video_from_gif(gif_path: str, output_path):
+    fps = util.detect_fps(gif_path)
+    filter = """scale='trunc(in_w/2)*2':'trunc(in_h/2)*2',format=yuv420p,fps=10"""
+    run_ffmpeg(['-i', gif_path, '-vf', f'"{filter}"', '-movflags', '+faststart', '-shortest', output_path])
+
+
+def repair_video(original_video: str, final_video : str):
+    run_ffmpeg(['-i', original_video, '-movflags', 'faststart', '-acodec', 'copy', '-vcodec', 'copy', final_video])
 
 
 def restore_audio(intermediate_video: str, original_video: str, trim_frame_start, trim_frame_end, final_video : str) -> None:
roop/utilities.py  CHANGED

@@ -13,6 +13,11 @@ import tempfile
 import cv2
 import zipfile
 import traceback
+import threading
+import threading
+
+from typing import Union, Any
+from contextlib import nullcontext
 
 from pathlib import Path
 from typing import List, Any
@@ -26,6 +31,10 @@ import roop.globals
 TEMP_FILE = "temp.mp4"
 TEMP_DIRECTORY = "temp"
 
+THREAD_SEMAPHORE = threading.Semaphore()
+NULL_CONTEXT = nullcontext()
+
+
 # monkey patch ssl for mac
 if platform.system().lower() == "darwin":
     ssl._create_default_https_context = ssl._create_unverified_context
@@ -173,6 +182,8 @@ def has_extension(filepath: str, extensions: List[str]) -> bool:
 
 def is_image(image_path: str) -> bool:
     if image_path and os.path.isfile(image_path):
+        if image_path.endswith(".webp"):
+            return True
         mimetype, _ = mimetypes.guess_type(image_path)
         return bool(mimetype and mimetype.startswith("image/"))
     return False
@@ -337,3 +348,31 @@ gradio: {gradio.__version__}
 
 def compute_cosine_distance(emb1, emb2) -> float:
     return distance.cosine(emb1, emb2)
+
+def has_cuda_device():
+    return torch.cuda is not None and torch.cuda.is_available()
+
+
+def print_cuda_info():
+    try:
+        print(f'Number of CUDA devices: {torch.cuda.device_count()} Currently used Id: {torch.cuda.current_device()} Device Name: {torch.cuda.get_device_name(torch.cuda.current_device())}')
+    except:
+        print('No CUDA device found!')
+
+def clean_dir(path: str):
+    contents = os.listdir(path)
+    for item in contents:
+        item_path = os.path.join(path, item)
+        try:
+            if os.path.isfile(item_path):
+                os.remove(item_path)
+            elif os.path.isdir(item_path):
+                shutil.rmtree(item_path)
+        except Exception as e:
+            print(e)
+
+
+def conditional_thread_semaphore() -> Union[Any, Any]:
+    if 'DmlExecutionProvider' in roop.globals.execution_providers or 'ROCMExecutionProvider' in roop.globals.execution_providers:
+        return THREAD_SEMAPHORE
+    return NULL_CONTEXT
roop/virtualcam.py  CHANGED

@@ -10,7 +10,7 @@ cam_active = False
 cam_thread = None
 vcam = None
 
-def virtualcamera(streamobs, cam_num,width,height):
+def virtualcamera(streamobs, use_xseg, use_mouthrestore, cam_num,width,height):
     from roop.ProcessOptions import ProcessOptions
     from roop.core import live_swap, get_processing_plugins
 
@@ -44,10 +44,11 @@ def virtualcamera(streamobs, cam_num,width,height):
         print(f'Using {cam.native_fmt}')
     else:
         print(f'Not streaming to virtual camera!')
+    subsample_size = roop.globals.subsample_size
 
-
-    options = ProcessOptions(get_processing_plugins("mask_xseg"), roop.globals.distance_threshold, roop.globals.blend_ratio,
-                             "all", 0, None, None, 1, False)
+
+    options = ProcessOptions(get_processing_plugins("mask_xseg" if use_xseg else None), roop.globals.distance_threshold, roop.globals.blend_ratio,
+                             "all", 0, None, None, 1, subsample_size, False, use_mouthrestore)
     while cam_active:
         ret, frame = cap.read()
         if not ret:
@@ -67,12 +68,12 @@
 
 
 
-def start_virtual_cam(streamobs, cam_number, resolution):
+def start_virtual_cam(streamobs, use_xseg, use_mouthrestore, cam_number, resolution):
     global cam_thread, cam_active
 
     if not cam_active:
         width, height = map(int, resolution.split('x'))
-        cam_thread = threading.Thread(target=virtualcamera, args=[streamobs, cam_number, width, height])
+        cam_thread = threading.Thread(target=virtualcamera, args=[streamobs, use_xseg, use_mouthrestore, cam_number, width, height])
        cam_thread.start()
 
 
@@ -83,5 +84,5 @@ def stop_virtual_cam():
     if cam_active:
         cam_active = False
         cam_thread.join()
-
+
 