"""Spirit Animal Generator: a Gradio app for Hugging Face Spaces.

The image tab turns a portrait into one or more "spirit animal" pictures
with Stable Diffusion + ControlNet (OpenPose), prompted by a vision model.
The video tab animates such a picture with LivePortrait, driven by an
uploaded video.
"""

import base64
import json
import os
import shutil
import subprocess
import sys
import time

# NOTE: `spaces` stays ahead of torch/diffusers, as in the original import
# order (ZeroGPU expects it to be imported before CUDA is touched).
import spaces

import cv2
import gradio as gr
import matplotlib.pyplot as plt
import mediapipe as mp
import numpy as np
import requests
import torch
from diffusers import (
    ControlNetModel,
    StableDiffusionControlNetInpaintPipeline,
    StableDiffusionControlNetPipeline,
)
from huggingface_hub import snapshot_download
from moviepy.editor import ImageSequenceClip
from PIL import Image
from rembg import remove
from scipy import ndimage
from tqdm import tqdm
from transformers import AutoTokenizer


@spaces.GPU(duration=120)
def get_project_root():
    """Get the root directory of the current project."""
    return os.path.abspath(os.path.dirname(__file__))


# ---------------------------------------------------------------------------
# Paths and constants
# ---------------------------------------------------------------------------
PROJECT_ROOT = get_project_root()
LIVEPORTRAIT_DIR = os.path.join(PROJECT_ROOT, "LivePortrait")
# NOTE(review): repository URL reconstructed from the Hugging Face repo id
# "KwaiVGI/LivePortrait" used below -- confirm it is the intended upstream.
LIVEPORTRAIT_REPO_URL = "https://github.com/KwaiVGI/LivePortrait"
XPOSE_OPS_DIR = os.path.join(
    LIVEPORTRAIT_DIR,
    "src", "utils", "dependencies", "XPose", "models", "UniPose", "ops",
)

OPENAI_CHAT_URL = "https://api.openai.com/v1/chat/completions"
# The API key is read from the environment; never commit it to the source.
OPENAI_API_KEY_ENV = "OPENAI_API_KEY"
FALLBACK_PROMPT = "A majestic animal"

SINGLE_ANIMAL_INSTRUCTION = (
    "Based on the provided image, think of one spirit animal that is right "
    "for the person, and answer in the following format: An ultra-realistic, "
    "highly detailed photograph of a single {animal} with facial features "
    "characterized by {description}, standing upright in a human-like pose, "
    "looking directly at the camera, against a solid, neutral background. "
    "Generate one sentence without any other responses or numbering."
)
SINGLE_NEGATIVE_PROMPT = (
    "multiple heads, extra limbs, duplicate faces, mutated anatomy, "
    "disfigured, blurry"
)
MULTI_NEGATIVE_PROMPT = (
    "multiple heads, extra limbs, duplicate faces, mutated anatomy, disfigured, "
    "blurry, deformed, text, watermark, logo, low resolution"
)

# Ensure working directory is project root
os.chdir(PROJECT_ROOT)


# ---------------------------------------------------------------------------
# LivePortrait setup
# ---------------------------------------------------------------------------
def _run_command(cmd, cwd=None, check=True):
    """Run *cmd* (an argument list, no shell) and return its exit code.

    With ``check=True`` a non-zero exit raises ``CalledProcessError``;
    otherwise the failure is only logged.
    """
    print("Executing command:", " ".join(cmd))
    completed = subprocess.run(cmd, cwd=cwd, check=check)
    if completed.returncode != 0:
        print(f"Warning: command exited with code {completed.returncode}")
    return completed.returncode


@spaces.GPU(duration=120)
def download_liveportrait():
    """
    Clone the LivePortrait repository and prepare its dependencies.

    Every step runs with an explicit ``cwd`` instead of ``os.chdir`` so the
    process working directory is never changed and repeated calls behave
    the same way.

    Raises:
        Exception: re-raised after logging if cloning fails or a required
            directory is missing.
    """
    try:
        if not os.path.exists(LIVEPORTRAIT_DIR):
            print("Cloning LivePortrait repository...")
            _run_command(["git", "clone", LIVEPORTRAIT_REPO_URL, LIVEPORTRAIT_DIR])

        # Install dependencies. Best effort: the original ignored the exit
        # status of this step, so a failure is logged but not fatal.
        print("Installing LivePortrait dependencies...")
        _run_command(
            [sys.executable, "-m", "pip", "install", "-r", "requirements.txt"],
            cwd=LIVEPORTRAIT_DIR,
            check=False,
        )

        # Build the MultiScaleDeformableAttention module (also best effort).
        print("Building MultiScaleDeformableAttention...")
        _run_command([sys.executable, "setup.py", "build"], cwd=XPOSE_OPS_DIR, check=False)
        _run_command([sys.executable, "setup.py", "install"], cwd=XPOSE_OPS_DIR, check=False)

        # Make sure the built module can be imported.
        if XPOSE_OPS_DIR not in sys.path:
            sys.path.append(XPOSE_OPS_DIR)

        print("LivePortrait setup completed")
    except Exception as e:
        print("Failed to initialize LivePortrait:", e)
        raise


@spaces.GPU(duration=120)
def download_huggingface_resources():
    """
    Download additional necessary resources from Hugging Face using the CLI.

    The weights are stored in ``LivePortrait/pretrained_weights``, which is
    where the original cwd-relative path pointed once setup had run.

    Raises:
        subprocess.CalledProcessError: if the CLI exits with a non-zero status.
        Exception: any other failure, re-raised after logging.
    """
    try:
        local_dir = os.path.join(LIVEPORTRAIT_DIR, "pretrained_weights")
        os.makedirs(local_dir, exist_ok=True)

        # Use the Hugging Face CLI for downloading
        cmd = [
            "huggingface-cli", "download",
            "KwaiVGI/LivePortrait",
            "--local-dir", local_dir,
            "--exclude", "*.git*", "README.md", "docs",
        ]
        print("Executing command:", " ".join(cmd))
        subprocess.run(cmd, check=True)

        print("Resources successfully downloaded to:", local_dir)
    except subprocess.CalledProcessError as e:
        print("Error during Hugging Face CLI download:", e)
        raise
    except Exception as e:
        print("General error in downloading resources:", e)
        raise


# ---------------------------------------------------------------------------
# Models
# ---------------------------------------------------------------------------
# Initialize the necessary models and components
mp_pose = mp.solutions.pose
mp_drawing = mp.solutions.drawing_utils

# Load ControlNet model
controlnet = ControlNetModel.from_pretrained(
    'lllyasviel/sd-controlnet-openpose', torch_dtype=torch.float16
)

# Load Stable Diffusion model with ControlNet
pipe_controlnet = StableDiffusionControlNetPipeline.from_pretrained(
    'runwayml/stable-diffusion-v1-5',
    controlnet=controlnet,
    torch_dtype=torch.float16
)

# Load Inpaint Controlnet
pipe_inpaint_controlnet = StableDiffusionControlNetInpaintPipeline.from_pretrained(
    "runwayml/stable-diffusion-inpainting",
    controlnet=controlnet,
    torch_dtype=torch.float16
)

# Move to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
pipe_controlnet.to(device)
pipe_inpaint_controlnet.to(device)

pipe_controlnet.enable_attention_slicing()
pipe_inpaint_controlnet.enable_attention_slicing()


# ---------------------------------------------------------------------------
# Image helpers
# ---------------------------------------------------------------------------
@spaces.GPU(duration=120)
def resize_to_multiple_of_64(width, height):
    """Round *width* and *height* down to the nearest multiple of 64."""
    return (width // 64) * 64, (height // 64) * 64


@spaces.GPU(duration=120)
def expand_mask(mask, kernel_size):
    """Dilate a mask with a square structuring element.

    Args:
        mask: image convertible with ``np.array``; non-zero pixels are
            treated as foreground.
        kernel_size (int): side length of the square structuring element.

    Returns:
        PIL.Image.Image: the dilated mask with values 0 or 255.
    """
    mask_array = np.array(mask)
    structuring_element = np.ones((kernel_size, kernel_size), dtype=np.uint8)
    expanded_mask_array = ndimage.binary_dilation(
        mask_array, structure=structuring_element
    ).astype(np.uint8) * 255
    return Image.fromarray(expanded_mask_array)


@spaces.GPU(duration=120)
def crop_face_to_square(image_rgb, padding_ratio=0.2):
    """
    Detects the face in the input image and crops an enlarged square region around it.

    Args:
        image_rgb (np.ndarray): RGB image of shape (H, W, 3).
        padding_ratio (float): fraction by which the face box is enlarged.

    Returns:
        np.ndarray | None: the crop resized to 768x768, or None when no
        face is detected. The crop is clamped to the image bounds, so it
        may not be square before resizing.
    """
    face_cascade = cv2.CascadeClassifier(
        cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
    )
    gray_image = cv2.cvtColor(image_rgb, cv2.COLOR_RGB2GRAY)
    faces = face_cascade.detectMultiScale(
        gray_image, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30)
    )

    if len(faces) == 0:
        print("No face detected.")
        return None

    x, y, w, h = faces[0]
    center_x, center_y = x + w // 2, y + h // 2
    side_length = max(w, h)
    padded_side_length = int(side_length * (1 + padding_ratio))
    half_side = padded_side_length // 2

    top_left_x = max(center_x - half_side, 0)
    top_left_y = max(center_y - half_side, 0)
    bottom_right_x = min(center_x + half_side, image_rgb.shape[1])
    bottom_right_y = min(center_y + half_side, image_rgb.shape[0])

    cropped_image = image_rgb[top_left_y:bottom_right_y, top_left_x:bottom_right_x]
    return cv2.resize(cropped_image, (768, 768), interpolation=cv2.INTER_AREA)


def _load_rgb_image(image_path):
    """Read *image_path* as an RGB array, or return None if it cannot be read."""
    if not image_path:
        return None
    image = cv2.imread(image_path)
    if image is None:
        return None
    return cv2.cvtColor(image, cv2.COLOR_BGR2RGB)


def _generation_size(width, height, target=768):
    """Return (width, height) for generation: longest side *target*, multiples of 64."""
    aspect_ratio = width / height
    if aspect_ratio > 1:
        gen_width = target
        gen_height = int(gen_width / aspect_ratio)
    else:
        gen_height = target
        gen_width = int(gen_height * aspect_ratio)
    gen_width, gen_height = resize_to_multiple_of_64(gen_width, gen_height)
    # Extreme aspect ratios would otherwise round a side down to 0.
    return max(gen_width, 64), max(gen_height, 64)


def _render_pose_skeleton(image_rgb, size):
    """Draw the detected pose as white lines on black, resized to *size*.

    Returns:
        PIL.Image.Image | None: the ControlNet conditioning image, or None
        when MediaPipe finds no pose in the image.
    """
    with mp_pose.Pose(static_image_mode=True) as pose:
        results = pose.process(image_rgb)
    if not results.pose_landmarks:
        return None

    canvas = np.zeros_like(image_rgb)
    canvas_height, canvas_width = canvas.shape[:2]
    landmarks = results.pose_landmarks.landmark
    for start_idx, end_idx in mp_pose.POSE_CONNECTIONS:
        start, end = landmarks[start_idx], landmarks[end_idx]
        # Only draw limbs whose two endpoints are both confidently visible.
        if start.visibility > 0.5 and end.visibility > 0.5:
            x1, y1 = int(start.x * canvas_width), int(start.y * canvas_height)
            x2, y2 = int(end.x * canvas_width), int(end.y * canvas_height)
            cv2.line(canvas, (x1, y1), (x2, y2), (255, 255, 255), 2)

    return Image.fromarray(
        cv2.resize(canvas, size, interpolation=cv2.INTER_LANCZOS4)
    )


def _expanded_subject_mask(image_rgb, size):
    """Return a dilated foreground mask of *image_rgb* for inpainting."""
    cutout = remove(Image.fromarray(image_rgb))
    alpha = cutout.split()[-1].convert('L')
    return expand_mask(alpha, min(size) // 15)


# ---------------------------------------------------------------------------
# Prompt generation
# ---------------------------------------------------------------------------
def _multi_animal_instruction(num_images):
    """Build the instruction asking the vision model for *num_images* animals."""
    return (
        "Based on the provided image, think of " + str(num_images) +
        " different spirit animals that are right for the person, and answer "
        "in the following format for each: An ultra-realistic, highly detailed "
        "photograph of a {animal} with facial features characterized by "
        "{description}, standing upright in a human-like pose, looking directly "
        "at the camera, against a solid, neutral background. Generate these "
        "sentences without any other responses or numbering. For the animal "
        "choose between owl, bear, fox, koala, lion, dog"
    )


def _request_prompt_text(image_rgb, instruction, max_tokens):
    """Ask the vision model to describe a spirit animal for *image_rgb*.

    Returns:
        str | None: the model's reply, or None when the API key is missing,
        the request fails, or the response carries no usable choice.
    """
    api_key = os.environ.get(OPENAI_API_KEY_ENV)
    if not api_key:
        print(f"{OPENAI_API_KEY_ENV} is not set; skipping prompt request.")
        return None

    # cv2.imencode expects BGR channel order; convert so the colours of the
    # uploaded image are not swapped.
    encoded_ok, jpeg = cv2.imencode('.jpg', cv2.cvtColor(image_rgb, cv2.COLOR_RGB2BGR))
    if not encoded_ok:
        return None
    base64_image = base64.b64encode(jpeg.tobytes()).decode()

    headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
    payload = {
        "model": "gpt-4o-mini",
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": instruction},
                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}},
                ],
            }
        ],
        "max_tokens": max_tokens,
    }

    try:
        response = requests.post(OPENAI_CHAT_URL, headers=headers, json=payload, timeout=60)
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        print("Prompt request failed:", e)
        return None

    try:
        return data['choices'][0]['message']['content'] or None
    except (KeyError, IndexError, TypeError):
        return None


# ---------------------------------------------------------------------------
# Image generation
# ---------------------------------------------------------------------------
def _generate_images(prompts, negative_prompt, pose_pil, size,
                     init_image=None, mask_image=None):
    """Generate one image per prompt.

    Uses the inpainting pipeline when *mask_image* is given (the background
    of *init_image* is kept), otherwise the plain ControlNet pipeline.
    """
    width, height = size
    shared_kwargs = dict(
        negative_prompt=negative_prompt,
        num_inference_steps=20,
        guidance_scale=5,
        width=width,
        height=height,
    )
    generated_images = []
    with torch.no_grad(), torch.autocast(device_type=device.type):
        for prompt in prompts:
            if mask_image is not None:
                result = pipe_inpaint_controlnet(
                    prompt=prompt,
                    image=init_image,
                    mask_image=mask_image,
                    control_image=pose_pil,
                    **shared_kwargs,
                )
            else:
                result = pipe_controlnet(prompt=prompt, image=pose_pil, **shared_kwargs)
            generated_images.append(result.images[0])
    return generated_images


@spaces.GPU(duration=120)
def spirit_animal_baseline(image_path, num_images = 4):
    """Generate *num_images* spirit animals on a fresh background.

    The image is first cropped to the face region.

    Returns:
        tuple[str, list]: the prompt (or an error message) and the generated
        PIL images; the list is empty on failure.
    """
    image_rgb = _load_rgb_image(image_path)
    if image_rgb is None:
        return "Could not read the input image.", []

    image_rgb = crop_face_to_square(image_rgb)
    if image_rgb is None:
        return "No face detected.", []

    height, width = image_rgb.shape[:2]
    size = _generation_size(width, height)

    pose_pil = _render_pose_skeleton(image_rgb, size)
    if pose_pil is None:
        print("No pose detected.")
        return "No pose detected.", []

    prompt = _request_prompt_text(image_rgb, SINGLE_ANIMAL_INSTRUCTION, 100) or FALLBACK_PROMPT
    generated_images = _generate_images(
        [prompt] * num_images, SINGLE_NEGATIVE_PROMPT, pose_pil, size
    )
    return prompt, generated_images


@spaces.GPU(duration=120)
def spirit_animal_with_background(image_path, num_images = 4):
    """Generate *num_images* spirit animals, keeping the photo's background.

    The full image is used (no face crop); the subject is masked out with
    rembg and inpainted.

    Returns:
        tuple[str, list]: the prompt (or an error message) and the generated
        PIL images; the list is empty on failure.
    """
    image_rgb = _load_rgb_image(image_path)
    if image_rgb is None:
        return "Could not read the input image.", []

    height, width = image_rgb.shape[:2]
    size = _generation_size(width, height)

    pose_pil = _render_pose_skeleton(image_rgb, size)
    if pose_pil is None:
        print("No pose detected.")
        return "No pose detected.", []

    prompt = _request_prompt_text(image_rgb, SINGLE_ANIMAL_INSTRUCTION, 100) or FALLBACK_PROMPT
    expanded_mask = _expanded_subject_mask(image_rgb, size)
    generated_images = _generate_images(
        [prompt] * num_images, SINGLE_NEGATIVE_PROMPT, pose_pil, size,
        init_image=Image.fromarray(image_rgb), mask_image=expanded_mask,
    )
    return prompt, generated_images


@spaces.GPU(duration=120)
def generate_multiple_animals(image_path, keep_background=True, num_images = 4):
    """Generate up to *num_images* different spirit animals for one photo.

    Args:
        image_path (str): path of the input image.
        keep_background (bool): inpaint over the subject (True) or generate
            on a fresh background (False).
        num_images (int): number of animals requested from the vision model.

    Returns:
        tuple[str, list]: the numbered prompts (or an error message) and the
        generated PIL images; the list is empty on failure.
    """
    image_rgb = _load_rgb_image(image_path)
    if image_rgb is None:
        return "Could not read the input image.", []

    image_rgb = crop_face_to_square(image_rgb)
    if image_rgb is None:
        return "No face detected.", []

    height, width = image_rgb.shape[:2]
    size = _generation_size(width, height)

    # Check the pose before spending an API call on the prompt.
    pose_pil = _render_pose_skeleton(image_rgb, size)
    if pose_pil is None:
        print("No pose detected.")
        return "No pose detected.", []

    content = _request_prompt_text(image_rgb, _multi_animal_instruction(num_images), 500)
    sentences = [part.strip() for part in (content or "").strip().split('.')]
    # Cap at the requested count; fall back to a generic prompt if the API
    # returned nothing usable.
    prompts = [s for s in sentences if s][:max(num_images, 1)] or [FALLBACK_PROMPT]
    formatted_prompts = "\n".join(
        f"{index}. {prompt}" for index, prompt in enumerate(prompts, start=1)
    )

    if keep_background:
        generated_images = _generate_images(
            prompts, MULTI_NEGATIVE_PROMPT, pose_pil, size,
            init_image=Image.fromarray(image_rgb),
            mask_image=_expanded_subject_mask(image_rgb, size),
        )
    else:
        generated_images = _generate_images(prompts, MULTI_NEGATIVE_PROMPT, pose_pil, size)

    return formatted_prompts, generated_images


@spaces.GPU(duration=120)
def generate_spirit_animal(image, animal_type, background):
    """Dispatch the image tab's request to the matching generator.

    Args:
        image (str): file path of the uploaded image.
        animal_type (str): "Single Animal" or "Multiple Animals".
        background (str): "Preserve Background" keeps the photo's background.

    Returns:
        tuple[str, list]: prompt text (or an error message) and images.
    """
    keep_background = background == "Preserve Background"
    if animal_type == "Single Animal":
        if keep_background:
            return spirit_animal_with_background(image)
        return spirit_animal_baseline(image)
    if animal_type == "Multiple Animals":
        return generate_multiple_animals(image, keep_background=keep_background)
    return f"Unknown animal type: {animal_type}", []


# ---------------------------------------------------------------------------
# Video generation
# ---------------------------------------------------------------------------
@spaces.GPU(duration=120)
def wait_for_file(file_path, timeout=500):
    """
    Wait for a file to be created, with a specified timeout.

    Args:
        file_path (str): The path of the file to wait for.
        timeout (int): Maximum time to wait in seconds.

    Returns:
        bool: True if the file is created, False if timeout occurs.
    """
    start_time = time.time()
    while not os.path.exists(file_path):
        if time.time() - start_time > timeout:
            return False
        time.sleep(0.5)  # Check every 0.5 seconds
    return True


@spaces.GPU(duration=120)
def generate_spirit_animal_video(driving_video_path):
    """Animate a generated spirit animal with LivePortrait.

    The first frame of the driving video is turned into a spirit-animal
    image, which LivePortrait then animates using the driving video.

    Args:
        driving_video_path (str): path of the driving video.

    Returns:
        str | None: absolute path of the generated video, or None on failure.
    """
    try:
        driving_video_path = os.path.abspath(driving_video_path)

        # NOTE(review): script name reconstructed as LivePortrait's
        # `inference.py` entry point -- confirm against the repository.
        script_path = os.path.join(LIVEPORTRAIT_DIR, "inference.py")
        if not os.path.exists(script_path):
            print(f"Error: Inference script not found at {script_path}.")
            return None

        # Step 1: Extract the first frame
        cap = cv2.VideoCapture(driving_video_path)
        try:
            if not cap.isOpened():
                print("Error: Unable to open video.")
                return None
            ret, frame = cap.read()
        finally:
            cap.release()
        if not ret:
            print("Error: Unable to read the first frame.")
            return None

        # Save the first frame
        first_frame_path = os.path.join(LIVEPORTRAIT_DIR, "first_frame.jpg")
        cv2.imwrite(first_frame_path, frame)
        print(f"First frame saved to: {first_frame_path}")

        # Step 2: Generate spirit animal image
        _, input_image = generate_multiple_animals(first_frame_path, True, 1)
        if not input_image:
            print("Error: Spirit animal generation failed.")
            return None

        spirit_animal_path = os.path.join(LIVEPORTRAIT_DIR, "animal.jpeg")
        cv2.imwrite(spirit_animal_path, cv2.cvtColor(np.array(input_image[0]), cv2.COLOR_RGB2BGR))
        print(f"Spirit animal image saved to: {spirit_animal_path}")

        # Step 3: Run inference
        # NOTE(review): assumes LivePortrait writes
        # "animations/<source stem>--<driving stem>.mp4" relative to its cwd,
        # matching the file name hard-coded in the original -- confirm.
        source_stem = os.path.splitext(os.path.basename(spirit_animal_path))[0]
        driving_stem = os.path.splitext(os.path.basename(driving_video_path))[0]
        output_path = os.path.join(
            LIVEPORTRAIT_DIR, "animations", f"{source_stem}--{driving_stem}.mp4"
        )
        # Remove a stale result so a failed run is never mistaken for success.
        if os.path.exists(output_path):
            os.remove(output_path)

        command = [
            sys.executable, script_path,
            "-s", spirit_animal_path,
            "-d", driving_video_path,
            "--driving_multiplier", "1.75",
            "--no_flag_stitching",
        ]
        print(f"Running command: {' '.join(command)}")
        result = subprocess.run(command, cwd=LIVEPORTRAIT_DIR).returncode
        if result != 0:
            print(f"Error: Command failed with exit code {result}.")
            return None

        # Verify output file exists
        if not os.path.exists(output_path):
            print(f"Error: Expected output video not found at {output_path}.")
            return None

        print(f"Output video generated at: {output_path}")
        return output_path
    except Exception as e:
        print(f"Error occurred: {e}")
        return None


@spaces.GPU(duration=120)
def compress_video(input_path, output_path, target_size_mb):
    """Re-encode a video, shrinking it towards *target_size_mb* if needed.

    The video is first rewritten with OpenCV (mp4v). If the result is larger
    than the target, ffmpeg re-encodes it at the bitrate that fits the
    target size for the video's duration.

    Args:
        input_path (str): source video.
        output_path (str): destination file.
        target_size_mb (float): desired maximum size in megabytes.

    Raises:
        ValueError: if the input video cannot be opened.
    """
    target_size_bytes = target_size_mb * 1024 * 1024
    output_path = os.path.abspath(output_path)
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    temp_output = os.path.join(os.path.dirname(output_path), "temp_compressed.mp4")

    cap = cv2.VideoCapture(input_path)
    if not cap.isOpened():
        cap.release()
        raise ValueError(f"Unable to open video: {input_path}")

    fourcc = cv2.VideoWriter_fourcc(*'mp4v')  # mp4 encoding
    # Some containers report 0 fps; fall back to a sane default.
    fps = int(cap.get(cv2.CAP_PROP_FPS)) or 25
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

    writer = cv2.VideoWriter(temp_output, fourcc, fps, (width, height))
    frame_count = 0
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            writer.write(frame)
            frame_count += 1
    finally:
        cap.release()
        writer.release()

    current_size = os.path.getsize(temp_output)
    duration = frame_count / fps
    if current_size > target_size_bytes and duration > 0:
        # Bits available for the whole clip divided by its length in seconds.
        bitrate = max(int(target_size_bytes * 8 / duration), 1)
        try:
            result = subprocess.run(
                ["ffmpeg", "-i", temp_output, "-b:v", str(bitrate), "-y", output_path]
            )
            succeeded = result.returncode == 0 and os.path.exists(output_path)
        except OSError as e:
            print("Warning: could not run ffmpeg:", e)
            succeeded = False
        if succeeded:
            os.remove(temp_output)
            return
        print("Warning: ffmpeg compression failed; using the uncompressed video.")

    shutil.move(temp_output, output_path)


@spaces.GPU(duration=120)
def process_video(video_file):
    """Gradio handler for the video tab.

    Sets up LivePortrait, compresses the upload and generates the video.

    Returns:
        A ``gr.update`` showing the generated video, or hiding the output
        component when any step fails.
    """
    hidden = gr.update(value=None, visible=False)
    if not video_file:
        print("No video provided.")
        return hidden

    # Initialize LivePortrait
    try:
        download_liveportrait()
    except Exception as e:
        print("Failed to initialize LivePortrait:", e)
        return hidden

    # Download Hugging Face resources
    try:
        download_huggingface_resources()
    except Exception as e:
        print("Failed to download Hugging Face resources:", e)
        return hidden

    compressed_path = os.path.join(LIVEPORTRAIT_DIR, "uploaded_video_compressed.mp4")
    try:
        compress_video(video_file, compressed_path, target_size_mb=1)
    except Exception as e:
        print("Failed to compress video:", e)
        return hidden
    print(f"Compressed and moved video to: {compressed_path}")

    # Generation is synchronous; use its result instead of polling the
    # filesystem, so a failure is reported immediately.
    output_video_path = generate_spirit_animal_video(compressed_path)
    if output_video_path is None:
        print("Video generation failed.")
        return hidden  # Hide output if failed

    # Return the generated video path
    print(f"Output video is ready: {output_video_path}")
    return gr.update(value=output_video_path, visible=True)  # Show video


# ---------------------------------------------------------------------------
# User interface
# ---------------------------------------------------------------------------
# Custom CSS styling for the interface
css = """
#title-container {
    font-family: 'Arial', sans-serif;
    color: #4a4a4a;
    text-align: center;
    margin-bottom: 20px;
}
#title-container h1 {
    font-size: 2.5em;
    font-weight: bold;
    color: #ff9900;
}
#title-container h2 {
    font-size: 1.2em;
    color: #6c757d;
}
#intro-text {
    font-size: 1em;
    color: #6c757d;
    margin: 50px;
    text-align: center;
    font-style: italic;
}
#prompt-output {
    font-family: 'Courier New', monospace;
    color: #5a5a5a;
    font-size: 1.1em;
    padding: 10px;
    background-color: #f9f9f9;
    border: 1px solid #ddd;
    border-radius: 5px;
    margin-top: 10px;
}
"""

# Title and description
# NOTE(review): markup reconstructed from the "#title-container h1/h2"
# selectors in `css`; adjust if the original structure differed.
title_html = """
<div id="title-container">
    <h1>Spirit Animal Generator</h1>
    <h2>Create your unique spirit animal with AI-assisted image generation.</h2>
</div>
"""

description_text = """
### Project Overview
Welcome to the Spirit Animal Generator! This tool leverages advanced AI technologies to create unique visualizations of spirit animals from both videos and images.

#### Key Features:
1. **Video Transformation**: Upload a driving video to generate a creative spirit animal animation.
2. **Image Creation**: Upload an image and customize the spirit animal type and background options.
3. **AI-Powered Prompting**: OpenAI's GPT generates descriptive prompts for each input.
4. **High-Quality Outputs**: Generated using Stable Diffusion and ControlNet for stunning visuals.

---

### How It Works:
1. **Upload Your Media**:
   - Videos: Ensure the file is in MP4 format.
   - Images: Use clear, high-resolution photos for better results.
2. **Customize Options**:
   - For images, select the type of animal and background settings.
3. **View Your Results**:
   - Videos will be transformed into animations.
   - Images will produce customized visual art along with a generated prompt.

Discover your spirit animal and let your imagination run wild!

---
"""

with gr.Blocks(css=css) as demo:
    gr.HTML(title_html)
    gr.Markdown(description_text)

    with gr.Tabs():
        with gr.Tab("Generate Spirit Animal Image"):
            gr.Markdown("Upload an image to generate a spirit animal.")
            with gr.Row():
                with gr.Column(scale=1):
                    image_input = gr.Image(type="filepath", label="Upload an image")
                    animal_type = gr.Radio(
                        choices=["Single Animal", "Multiple Animals"],
                        label="Animal Type",
                        value="Single Animal",
                    )
                    background_option = gr.Radio(
                        choices=["Preserve Background", "Don't Preserve Background"],
                        label="Background Option",
                        value="Preserve Background",
                    )
                    generate_image_button = gr.Button("Generate Image")
                with gr.Column(scale=1):
                    generated_prompt = gr.Textbox(label="Generated Prompt")
                    generated_gallery = gr.Gallery(label="Generated Images")

            generate_image_button.click(
                fn=generate_spirit_animal,
                inputs=[image_input, animal_type, background_option],
                outputs=[generated_prompt, generated_gallery],
            )

        with gr.Tab("Generate Spirit Animal Video"):
            gr.Markdown("Upload a driving video to generate a spirit animal video.")
            with gr.Row():
                with gr.Column(scale=1):
                    video_input = gr.Video(label="Upload a driving video (MP4 format)")
                    generate_video_button = gr.Button("Generate Video")
                with gr.Column(scale=1):
                    video_output = gr.Video(label="Generated Spirit Animal Video")

            generate_video_button.click(
                fn=process_video,
                inputs=video_input,
                outputs=video_output,
            )

demo.launch()