EchoMimic

Running on Zero

App Files Files Community

rangm committed on Aug 2, 2024

Commit

894e005

verified ·

1 Parent(s): accd1a5

Update webgui.py

Browse files

Files changed (1) hide show

webgui.py +129 -24

webgui.py CHANGED Viewed

@@ -29,6 +29,11 @@ import gradio as gr
 import huggingface_hub
 huggingface_hub.snapshot_download(
     repo_id='BadToBest/EchoMimic',
     local_dir='./pretrained_weights',
@@ -151,13 +156,71 @@ def select_face(det_bboxes, probs):
     return sorted_bboxes[0]
 @spaces.GPU
-def process_video(uploaded_img, uploaded_audio, width, height, length, seed, facemask_dilation_ratio, facecrop_dilation_ratio, context_frames, context_overlap, cfg, steps, sample_rate, fps, device):
-    if seed is not None and seed > -1:
-        generator = torch.manual_seed(seed)
-    else:
-        generator = torch.manual_seed(random.randint(100, 1000000))
     #### face mask prepare
     face_img = cv2.imread(uploaded_img)
     face_mask = np.zeros((face_img.shape[0], face_img.shape[1])).astype('uint8')
@@ -182,9 +245,40 @@ def process_video(uploaded_img, uploaded_audio, width, height, length, seed, fac
         face_img = cv2.resize(face_img, (width, height))
         face_mask = cv2.resize(face_mask, (width, height))
-    ref_image_pil = Image.fromarray(face_img[:, :, [2, 1, 0]])
     face_mask_tensor = torch.Tensor(face_mask).to(dtype=weight_dtype, device="cuda").unsqueeze(0).unsqueeze(0).unsqueeze(0) / 255.0
     video = pipe(
         ref_image_pil,
         uploaded_audio,
@@ -194,7 +288,7 @@ def process_video(uploaded_img, uploaded_audio, width, height, length, seed, fac
         length,
         steps,
         cfg,
-        generator=generator,
         audio_sample_rate=sample_rate,
         context_frames=context_frames,
         fps=fps,
@@ -290,7 +384,18 @@ with gr.Blocks() as demo:
             </div>
             """)
-    def generate_video(uploaded_img, uploaded_audio, width, height, length, seed, facemask_dilation_ratio, facecrop_dilation_ratio, context_frames, context_overlap, cfg, steps, sample_rate, fps, device):
         final_output_path = process_video(
             uploaded_img, uploaded_audio, width, height, length, seed, facemask_dilation_ratio, facecrop_dilation_ratio, context_frames, context_overlap, cfg, steps, sample_rate, fps, device
@@ -303,19 +408,19 @@ with gr.Blocks() as demo:
         inputs=[
             uploaded_img,
             uploaded_audio,
-            width,
-            height,
-            length,
-            seed,
-            facemask_dilation_ratio,
-            facecrop_dilation_ratio,
-            context_frames,
-            context_overlap,
-            cfg,
-            steps,
-            sample_rate,
-            fps,
-            device
         ],
         outputs=output_video,
         show_api=False
@@ -329,4 +434,4 @@ args = parser.parse_args()
 if __name__ == '__main__':
     demo.queue(max_size=3).launch(show_api=False, show_error=True)
-    #demo.launch(server_name=args.server_name, server_port=args.server_port, inbrowser=True)

 import huggingface_hub
+import pickle
+from src.utils.draw_utils import FaceMeshVisualizer
+from src.utils.motion_utils import motion_sync
+from src.utils.mp_utils  import LMKExtractor
 huggingface_hub.snapshot_download(
     repo_id='BadToBest/EchoMimic',
     local_dir='./pretrained_weights',
     return sorted_bboxes[0]
 @spaces.GPU
+lmk_extractor = LMKExtractor()
+# def process_video(uploaded_img, uploaded_audio, width, height, length, seed, facemask_dilation_ratio, facecrop_dilation_ratio, context_frames, context_overlap, cfg, steps, sample_rate, fps, device):
+#     if seed is not None and seed > -1:
+#         generator = torch.manual_seed(seed)
+#     else:
+#         generator = torch.manual_seed(random.randint(100, 1000000))
+#     #### face musk prepare
+#     face_img = cv2.imread(uploaded_img)
+#     face_mask = np.zeros((face_img.shape[0], face_img.shape[1])).astype('uint8')
+#     det_bboxes, probs = face_detector.detect(face_img)
+#     select_bbox = select_face(det_bboxes, probs)
+#     if select_bbox is None:
+#         face_mask[:, :] = 255
+#     else:
+#         xyxy = select_bbox[:4]
+#         xyxy = np.round(xyxy).astype('int')
+#         rb, re, cb, ce = xyxy[1], xyxy[3], xyxy[0], xyxy[2]
+#         r_pad = int((re - rb) * facemask_dilation_ratio)
+#         c_pad = int((ce - cb) * facemask_dilation_ratio)
+#         face_mask[rb - r_pad : re + r_pad, cb - c_pad : ce + c_pad] = 255
+#         #### face crop
+#         r_pad_crop = int((re - rb) * facecrop_dilation_ratio)
+#         c_pad_crop = int((ce - cb) * facecrop_dilation_ratio)
+#         crop_rect = [max(0, cb - c_pad_crop), max(0, rb - r_pad_crop), min(ce + c_pad_crop, face_img.shape[1]), min(re + r_pad_crop, face_img.shape[0])]
+#         face_img = crop_and_pad(face_img, crop_rect)
+#         face_mask = crop_and_pad(face_mask, crop_rect)
+#         face_img = cv2.resize(face_img, (width, height))
+#         face_mask = cv2.resize(face_mask, (width, height))
+#     ref_image_pil = Image.fromarray(face_img[:, :, [2, 1, 0]])
+#     face_mask_tensor = torch.Tensor(face_mask).to(dtype=weight_dtype, device="cuda").unsqueeze(0).unsqueeze(0).unsqueeze(0) / 255.0
+#     video = pipe(
+#         ref_image_pil,
+#         uploaded_audio,
+#         face_mask_tensor,
+#         width,
+#         height,
+#         length,
+#         steps,
+#         cfg,
+#         generator=generator,
+#         audio_sample_rate=sample_rate,
+#         context_frames=context_frames,
+#         fps=fps,
+#         context_overlap=context_overlap
+#     ).videos
+#     save_dir = Path("output/tmp")
+#     save_dir.mkdir(exist_ok=True, parents=True)
+#     output_video_path = save_dir / "output_video.mp4"
+#     save_videos_grid(video, str(output_video_path), n_rows=1, fps=fps)
+#     video_clip = VideoFileClip(str(output_video_path))
+#     audio_clip = AudioFileClip(uploaded_audio)
+#     final_output_path = save_dir / "output_video_with_audio.mp4"
+#     video_clip = video_clip.set_audio(audio_clip)
+#     video_clip.write_videofile(str(final_output_path), codec="libx264", audio_codec="aac")
+#     return final_output_path
+def process_video(uploaded_img, uploaded_audio, width, height, length, facemask_dilation_ratio, facecrop_dilation_ratio, context_frames, context_overlap, cfg, steps, sample_rate, fps, device):
     #### face mask prepare
     face_img = cv2.imread(uploaded_img)
     face_mask = np.zeros((face_img.shape[0], face_img.shape[1])).astype('uint8')
         face_img = cv2.resize(face_img, (width, height))
         face_mask = cv2.resize(face_mask, (width, height))
+    # ==================== face_locator =====================
+    '''
+    driver_video = "./assets/driven_videos/c.mp4"
+    input_frames_cv2 = [cv2.resize(center_crop_cv2(pil_to_cv2(i)), (512, 512)) for i in pils_from_video(driver_video)]
+    ref_det = lmk_extractor(face_img)
+    visualizer = FaceMeshVisualizer(draw_iris=False, draw_mouse=False)
+    pose_list = []
+    sequence_driver_det = []
+    try:
+        for frame in input_frames_cv2:
+            result = lmk_extractor(frame)
+            assert result is not None, "{}, bad video, face not detected".format(driver_video)
+            sequence_driver_det.append(result)
+    except:
+        print("face detection failed")
+        exit()
+    sequence_det_ms = motion_sync(sequence_driver_det, ref_det)
+    for p in sequence_det_ms:
+        tgt_musk = visualizer.draw_landmarks((width, height), p)
+        tgt_musk_pil = Image.fromarray(np.array(tgt_musk).astype(np.uint8)).convert('RGB')
+        pose_list.append(torch.Tensor(np.array(tgt_musk_pil)).to(dtype=weight_dtype, device="cuda").permute(2,0,1) / 255.0)
+    '''
+    # face_mask_tensor = torch.stack(pose_list, dim=1).unsqueeze(0)
     face_mask_tensor = torch.Tensor(face_mask).to(dtype=weight_dtype, device="cuda").unsqueeze(0).unsqueeze(0).unsqueeze(0) / 255.0
+    ref_image_pil = Image.fromarray(face_img[:, :, [2, 1, 0]])
+    #del pose_list, sequence_det_ms, sequence_driver_det, input_frames_cv2
     video = pipe(
         ref_image_pil,
         uploaded_audio,
         length,
         steps,
         cfg,
+        #generator=generator,
         audio_sample_rate=sample_rate,
         context_frames=context_frames,
         fps=fps,
             </div>
             """)
+    def generate_video(uploaded_img, uploaded_audio, facemask_dilation_ratio=default_values["facemask_dilation_ratio"],
+                       facecrop_dilation_ratio=default_values["facecrop_dilation_ratio"],
+                       context_frames=default_values["context_frames"],
+                       context_overlap=default_values["context_overlap"],
+                       cfg=default_values["cfg"],
+                       steps=default_values["steps"],
+                       sample_rate=default_values["sample_rate"],
+                       fps=default_values["fps"],
+                       device=default_values["device"],
+                       width=default_values["width"],
+                       height=default_values["height"],
+                       length=default_values["length"] ):
         final_output_path = process_video(
             uploaded_img, uploaded_audio, width, height, length, seed, facemask_dilation_ratio, facecrop_dilation_ratio, context_frames, context_overlap, cfg, steps, sample_rate, fps, device
         inputs=[
             uploaded_img,
             uploaded_audio,
+            # width,
+            # height,
+            # length,
+            # seed,
+            # facemask_dilation_ratio,
+            # facecrop_dilation_ratio,
+            # context_frames,
+            # context_overlap,
+            # cfg,
+            # steps,
+            # sample_rate,
+            # fps,
+            # device
         ],
         outputs=output_video,
         show_api=False
 if __name__ == '__main__':
     demo.queue(max_size=3).launch(show_api=False, show_error=True)
+    #demo.launch(server_name=args.server_name, server_port=args.server_port, inbrowser=True)