rangm committed
Commit 894e005 · verified · Parent(s): accd1a5

Update webgui.py: comment out the old seed-based process_video and replace it with a seed-less variant, import LMKExtractor / FaceMeshVisualizer / motion_sync for a (still commented-out) motion-sync pose path, give generate_video keyword defaults from default_values, and comment the tuning parameters out of the Gradio inputs list.

Files changed (1):
  1. webgui.py +129 -24
webgui.py CHANGED
@@ -29,6 +29,11 @@ import gradio as gr
 
 import huggingface_hub
 
+import pickle
+from src.utils.draw_utils import FaceMeshVisualizer
+from src.utils.motion_utils import motion_sync
+from src.utils.mp_utils import LMKExtractor
+
 huggingface_hub.snapshot_download(
     repo_id='BadToBest/EchoMimic',
     local_dir='./pretrained_weights',
@@ -151,13 +156,71 @@ def select_face(det_bboxes, probs):
     return sorted_bboxes[0]
 
 @spaces.GPU
-def process_video(uploaded_img, uploaded_audio, width, height, length, seed, facemask_dilation_ratio, facecrop_dilation_ratio, context_frames, context_overlap, cfg, steps, sample_rate, fps, device):
-
-    if seed is not None and seed > -1:
-        generator = torch.manual_seed(seed)
-    else:
-        generator = torch.manual_seed(random.randint(100, 1000000))
-
+lmk_extractor = LMKExtractor()
+# def process_video(uploaded_img, uploaded_audio, width, height, length, seed, facemask_dilation_ratio, facecrop_dilation_ratio, context_frames, context_overlap, cfg, steps, sample_rate, fps, device):
+
+#     if seed is not None and seed > -1:
+#         generator = torch.manual_seed(seed)
+#     else:
+#         generator = torch.manual_seed(random.randint(100, 1000000))
+
+#     #### face musk prepare
+#     face_img = cv2.imread(uploaded_img)
+#     face_mask = np.zeros((face_img.shape[0], face_img.shape[1])).astype('uint8')
+#     det_bboxes, probs = face_detector.detect(face_img)
+#     select_bbox = select_face(det_bboxes, probs)
+#     if select_bbox is None:
+#         face_mask[:, :] = 255
+#     else:
+#         xyxy = select_bbox[:4]
+#         xyxy = np.round(xyxy).astype('int')
+#         rb, re, cb, ce = xyxy[1], xyxy[3], xyxy[0], xyxy[2]
+#         r_pad = int((re - rb) * facemask_dilation_ratio)
+#         c_pad = int((ce - cb) * facemask_dilation_ratio)
+#         face_mask[rb - r_pad : re + r_pad, cb - c_pad : ce + c_pad] = 255
+
+#         #### face crop
+#         r_pad_crop = int((re - rb) * facecrop_dilation_ratio)
+#         c_pad_crop = int((ce - cb) * facecrop_dilation_ratio)
+#         crop_rect = [max(0, cb - c_pad_crop), max(0, rb - r_pad_crop), min(ce + c_pad_crop, face_img.shape[1]), min(re + r_pad_crop, face_img.shape[0])]
+#         face_img = crop_and_pad(face_img, crop_rect)
+#         face_mask = crop_and_pad(face_mask, crop_rect)
+#     face_img = cv2.resize(face_img, (width, height))
+#     face_mask = cv2.resize(face_mask, (width, height))
+
+#     ref_image_pil = Image.fromarray(face_img[:, :, [2, 1, 0]])
+#     face_mask_tensor = torch.Tensor(face_mask).to(dtype=weight_dtype, device="cuda").unsqueeze(0).unsqueeze(0).unsqueeze(0) / 255.0
+
+#     video = pipe(
+#         ref_image_pil,
+#         uploaded_audio,
+#         face_mask_tensor,
+#         width,
+#         height,
+#         length,
+#         steps,
+#         cfg,
+#         generator=generator,
+#         audio_sample_rate=sample_rate,
+#         context_frames=context_frames,
+#         fps=fps,
+#         context_overlap=context_overlap
+#     ).videos
+
+#     save_dir = Path("output/tmp")
+#     save_dir.mkdir(exist_ok=True, parents=True)
+#     output_video_path = save_dir / "output_video.mp4"
+#     save_videos_grid(video, str(output_video_path), n_rows=1, fps=fps)
+
+#     video_clip = VideoFileClip(str(output_video_path))
+#     audio_clip = AudioFileClip(uploaded_audio)
+#     final_output_path = save_dir / "output_video_with_audio.mp4"
+#     video_clip = video_clip.set_audio(audio_clip)
+#     video_clip.write_videofile(str(final_output_path), codec="libx264", audio_codec="aac")
+
+#     return final_output_path
+
+def process_video(uploaded_img, uploaded_audio, width, height, length, facemask_dilation_ratio, facecrop_dilation_ratio, context_frames, context_overlap, cfg, steps, sample_rate, fps, device):
     #### face musk prepare
     face_img = cv2.imread(uploaded_img)
     face_mask = np.zeros((face_img.shape[0], face_img.shape[1])).astype('uint8')
@@ -182,9 +245,40 @@ def process_video(uploaded_img, uploaded_audio, width, height, length, fac
     face_img = cv2.resize(face_img, (width, height))
     face_mask = cv2.resize(face_mask, (width, height))
 
-    ref_image_pil = Image.fromarray(face_img[:, :, [2, 1, 0]])
+
+    # ==================== face_locator =====================
+    '''
+    driver_video = "./assets/driven_videos/c.mp4"
+
+    input_frames_cv2 = [cv2.resize(center_crop_cv2(pil_to_cv2(i)), (512, 512)) for i in pils_from_video(driver_video)]
+    ref_det = lmk_extractor(face_img)
+
+    visualizer = FaceMeshVisualizer(draw_iris=False, draw_mouse=False)
+
+    pose_list = []
+    sequence_driver_det = []
+    try:
+        for frame in input_frames_cv2:
+            result = lmk_extractor(frame)
+            assert result is not None, "{}, bad video, face not detected".format(driver_video)
+            sequence_driver_det.append(result)
+    except:
+        print("face detection failed")
+        exit()
+
+    sequence_det_ms = motion_sync(sequence_driver_det, ref_det)
+    for p in sequence_det_ms:
+        tgt_musk = visualizer.draw_landmarks((width, height), p)
+        tgt_musk_pil = Image.fromarray(np.array(tgt_musk).astype(np.uint8)).convert('RGB')
+        pose_list.append(torch.Tensor(np.array(tgt_musk_pil)).to(dtype=weight_dtype, device="cuda").permute(2,0,1) / 255.0)
+    '''
+    # face_mask_tensor = torch.stack(pose_list, dim=1).unsqueeze(0)
     face_mask_tensor = torch.Tensor(face_mask).to(dtype=weight_dtype, device="cuda").unsqueeze(0).unsqueeze(0).unsqueeze(0) / 255.0
 
+    ref_image_pil = Image.fromarray(face_img[:, :, [2, 1, 0]])
+
+    #del pose_list, sequence_det_ms, sequence_driver_det, input_frames_cv2
+
     video = pipe(
         ref_image_pil,
         uploaded_audio,
@@ -194,7 +288,7 @@
         length,
         steps,
        cfg,
-        generator=generator,
+        #generator=generator,
         audio_sample_rate=sample_rate,
        context_frames=context_frames,
        fps=fps,
@@ -290,7 +384,18 @@ with gr.Blocks() as demo:
     </div>
     """)
 
-    def generate_video(uploaded_img, uploaded_audio, width, height, length, seed, facemask_dilation_ratio, facecrop_dilation_ratio, context_frames, context_overlap, cfg, steps, sample_rate, fps, device):
+    def generate_video(uploaded_img, uploaded_audio, facemask_dilation_ratio=default_values["facemask_dilation_ratio"],
+                       facecrop_dilation_ratio=default_values["facecrop_dilation_ratio"],
+                       context_frames=default_values["context_frames"],
+                       context_overlap=default_values["context_overlap"],
+                       cfg=default_values["cfg"],
+                       steps=default_values["steps"],
+                       sample_rate=default_values["sample_rate"],
+                       fps=default_values["fps"],
+                       device=default_values["device"],
+                       width=default_values["width"],
+                       height=default_values["height"],
+                       length=default_values["length"]):
 
         final_output_path = process_video(
             uploaded_img, uploaded_audio, width, height, length, seed, facemask_dilation_ratio, facecrop_dilation_ratio, context_frames, context_overlap, cfg, steps, sample_rate, fps, device
@@ -303,19 +408,19 @@
         inputs=[
             uploaded_img,
             uploaded_audio,
-            width,
-            height,
-            length,
-            seed,
-            facemask_dilation_ratio,
-            facecrop_dilation_ratio,
-            context_frames,
-            context_overlap,
-            cfg,
-            steps,
-            sample_rate,
-            fps,
-            device
+            # width,
+            # height,
+            # length,
+            # seed,
+            # facemask_dilation_ratio,
+            # facecrop_dilation_ratio,
+            # context_frames,
+            # context_overlap,
+            # cfg,
+            # steps,
+            # sample_rate,
+            # fps,
+            # device
         ],
         outputs=output_video,
         show_api=False
@@ -329,4 +434,4 @@ args = parser.parse_args()
 
 if __name__ == '__main__':
     demo.queue(max_size=3).launch(show_api=False, show_error=True)
-#demo.launch(server_name=args.server_name, server_port=args.server_port, inbrowser=True)
+    #demo.launch(server_name=args.server_name, server_port=args.server_port, inbrowser=True)