Spaces:

Xuanyou
/

Spirit_Animals_Example

Runtime error

App Files Files Community

yy1636 committed on Dec 5, 2024

Commit

1919c24

verified ·

1 Parent(s): 0b0244d

Create app.py

Browse files

Files changed (1) hide show

app.py +697 -0

app.py ADDED Viewed

	@@ -0,0 +1,697 @@

+import gradio as gr
+import torch
+import cv2
+import numpy as np
+import mediapipe as mp
+import matplotlib.pyplot as plt
+from PIL import Image
+from diffusers import StableDiffusionControlNetPipeline, ControlNetModel, StableDiffusionControlNetInpaintPipeline
+from transformers import AutoTokenizer
+import base64
+import requests
+import json
+from rembg import remove
+from scipy import ndimage
+from moviepy.editor import ImageSequenceClip
+from tqdm import tqdm
+import os
+import shutil
+import time
+from huggingface_hub import snapshot_download
+import subprocess
+import sys
+def download_liveportrait():
+    """
+    Clone the LivePortrait repository and prepare its dependencies.
+    """
+    liveportrait_path = "./LivePortrait"
+    try:
+        if not os.path.exists(liveportrait_path):
+            print("Cloning LivePortrait repository...")
+            os.system(f"git clone https://github.com/KwaiVGI/LivePortrait.git {liveportrait_path}")
+        # 安装依赖
+        os.chdir(liveportrait_path)
+        print("Installing LivePortrait dependencies...")
+        os.system("pip install -r requirements.txt")
+        # 构建 MultiScaleDeformableAttention 模块
+        dependency_path = "src/utils/dependencies/XPose/models/UniPose/ops"
+        os.chdir(dependency_path)
+        print("Building MultiScaleDeformableAttention...")
+        os.system("python setup.py build")
+        os.system("python setup.py install")
+        # 确保模块路径可用
+        module_path = os.path.abspath(dependency_path)
+        if module_path not in sys.path:
+            sys.path.append(module_path)
+        # 返回 LivePortrait 目录
+        os.chdir("../../../../../../../")
+        print("LivePortrait setup completed")
+    except Exception as e:
+        print("Failed to initialize LivePortrait:", e)
+        raise
+def download_huggingface_resources():
+    """
+    Download additional necessary resources from Hugging Face using the CLI.
+    """
+    try:
+        local_dir = "./pretrained_weights"
+        os.makedirs(local_dir, exist_ok=True)
+        # Use the Hugging Face CLI for downloading
+        cmd = [
+            "huggingface-cli", "download",
+            "KwaiVGI/LivePortrait",
+            "--local-dir", local_dir,
+            "--exclude", "*.git*", "README.md", "docs"
+        ]
+        print("Executing command:", " ".join(cmd))
+        subprocess.run(cmd, check=True)
+        print("Resources successfully downloaded to:", local_dir)
+    except subprocess.CalledProcessError as e:
+        print("Error during Hugging Face CLI download:", e)
+        raise
+    except Exception as e:
+        print("General error in downloading resources:", e)
+        raise
+def get_project_root():
+    """Get the root directory of the current project."""
+    return os.path.abspath(os.path.dirname(__file__))
+# Ensure working directory is project root
+os.chdir(get_project_root())
+# Initialize the necessary models and components
+mp_pose = mp.solutions.pose
+mp_drawing = mp.solutions.drawing_utils
+# Load ControlNet model
+controlnet = ControlNetModel.from_pretrained('lllyasviel/sd-controlnet-openpose', torch_dtype=torch.float16)
+# Load Stable Diffusion model with ControlNet
+pipe_controlnet = StableDiffusionControlNetPipeline.from_pretrained(
+    'runwayml/stable-diffusion-v1-5',
+    controlnet=controlnet,
+    torch_dtype=torch.float16
+)
+# Load Inpaint Controlnet
+pipe_inpaint_controlnet = StableDiffusionControlNetInpaintPipeline.from_pretrained(
+    "runwayml/stable-diffusion-inpainting",
+    controlnet=controlnet,
+    torch_dtype=torch.float16
+)
+# Move to GPU if available
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+pipe_controlnet.to(device)
+pipe_controlnet.enable_attention_slicing()
+pipe_inpaint_controlnet.to(device)
+pipe_inpaint_controlnet.enable_attention_slicing()
+def resize_to_multiple_of_64(width, height):
+    return (width // 64) * 64, (height // 64) * 64
+def expand_mask(mask, kernel_size):
+    mask_array = np.array(mask)
+    structuring_element = np.ones((kernel_size, kernel_size), dtype=np.uint8)
+    expanded_mask_array = ndimage.binary_dilation(
+        mask_array, structure=structuring_element
+    ).astype(np.uint8) * 255
+    return Image.fromarray(expanded_mask_array)
+def crop_face_to_square(image_rgb, padding_ratio=0.2):
+    """
+    Detects the face in the input image and crops an enlarged square region around it.
+    """
+    face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
+    gray_image = cv2.cvtColor(image_rgb, cv2.COLOR_RGB2GRAY)
+    faces = face_cascade.detectMultiScale(gray_image, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30))
+    if len(faces) == 0:
+        print("No face detected.")
+        return None
+    x, y, w, h = faces[0]
+    center_x, center_y = x + w // 2, y + h // 2
+    side_length = max(w, h)
+    padded_side_length = int(side_length * (1 + padding_ratio))
+    half_side = padded_side_length // 2
+    top_left_x = max(center_x - half_side, 0)
+    top_left_y = max(center_y - half_side, 0)
+    bottom_right_x = min(center_x + half_side, image_rgb.shape[1])
+    bottom_right_y = min(center_y + half_side, image_rgb.shape[0])
+    cropped_image = image_rgb[top_left_y:bottom_right_y, top_left_x:bottom_right_x]
+    resized_image = cv2.resize(cropped_image, (768, 768), interpolation=cv2.INTER_AREA)
+    return resized_image
+def spirit_animal_baseline(image_path, num_images = 4):
+    image = cv2.imread(image_path)
+    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+    image_rgb = crop_face_to_square(image_rgb)
+    original_height, original_width, _ = image_rgb.shape
+    aspect_ratio = original_width / original_height
+    if aspect_ratio > 1:
+        gen_width = 768
+        gen_height = int(gen_width / aspect_ratio)
+    else:
+        gen_height = 768
+        gen_width = int(gen_height * aspect_ratio)
+    gen_width, gen_height = resize_to_multiple_of_64(gen_width, gen_height)
+    with mp_pose.Pose(static_image_mode=True) as pose:
+        results = pose.process(image_rgb)
+        if results.pose_landmarks:
+            annotated_image = image_rgb.copy()
+            mp_drawing.draw_landmarks(
+                annotated_image, results.pose_landmarks, mp_pose.POSE_CONNECTIONS
+            )
+        else:
+            print("No pose detected.")
+            return "No pose detected.", []
+    pose_image = np.zeros_like(image_rgb)
+    for connection in mp_pose.POSE_CONNECTIONS:
+        start_idx, end_idx = connection
+        start, end = results.pose_landmarks.landmark[start_idx], results.pose_landmarks.landmark[end_idx]
+        if start.visibility > 0.5 and end.visibility > 0.5:
+            x1, y1 = int(start.x * pose_image.shape[1]), int(start.y * pose_image.shape[0])
+            x2, y2 = int(end.x * pose_image.shape[1]), int(end.y * pose_image.shape[0])
+            cv2.line(pose_image, (x1, y1), (x2, y2), (255, 255, 255), 2)
+    pose_pil = Image.fromarray(cv2.resize(pose_image, (gen_width, gen_height), interpolation=cv2.INTER_LANCZOS4))
+    base64_image = base64.b64encode(cv2.imencode('.jpg', image_rgb)[1]).decode()
+    api_key = "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"
+    headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
+    payload = {
+        "model": "gpt-4o-mini",
+        "messages": [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "Based on the provided image, think of one spirit animal that is right for the person, and answer in the following format: An ultra-realistic, highly detailed photograph of a single {animal} with facial features characterized by {description}, standing upright in a human-like pose, looking directly at the camera, against a solid, neutral background. Generate one sentence without any other responses or numbering."},
+                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}
+                ]
+            }
+        ],
+        "max_tokens": 100
+    }
+    response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
+    prompt = response.json()['choices'][0]['message']['content'] if 'choices' in response.json() else "A majestic animal"
+    num_images = num_images
+    generated_images = []
+    with torch.no_grad():
+        with torch.autocast(device_type=device.type):
+            for _ in range(num_images):
+                images = pipe_controlnet(
+                    prompt=prompt,
+                    negative_prompt="multiple heads, extra limbs, duplicate faces, mutated anatomy, disfigured, blurry",
+                    num_inference_steps=20,
+                    image=pose_pil,
+                    guidance_scale=5,
+                    width=gen_width,
+                    height=gen_height,
+                ).images
+                generated_images.append(images[0])
+    return prompt, generated_images
+def spirit_animal_with_background(image_path, num_images = 4):
+    image = cv2.imread(image_path)
+    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+    # image_rgb = crop_face_to_square(image_rgb)
+    original_height, original_width, _ = image_rgb.shape
+    aspect_ratio = original_width / original_height
+    if aspect_ratio > 1:
+        gen_width = 768
+        gen_height = int(gen_width / aspect_ratio)
+    else:
+        gen_height = 768
+        gen_width = int(gen_height * aspect_ratio)
+    gen_width, gen_height = resize_to_multiple_of_64(gen_width, gen_height)
+    with mp_pose.Pose(static_image_mode=True) as pose:
+        results = pose.process(image_rgb)
+        if results.pose_landmarks:
+            annotated_image = image_rgb.copy()
+            mp_drawing.draw_landmarks(
+                annotated_image, results.pose_landmarks, mp_pose.POSE_CONNECTIONS
+            )
+        else:
+            print("No pose detected.")
+            return "No pose detected.", []
+    pose_image = np.zeros_like(image_rgb)
+    for connection in mp_pose.POSE_CONNECTIONS:
+        start_idx, end_idx = connection
+        start, end = results.pose_landmarks.landmark[start_idx], results.pose_landmarks.landmark[end_idx]
+        if start.visibility > 0.5 and end.visibility > 0.5:
+            x1, y1 = int(start.x * pose_image.shape[1]), int(start.y * pose_image.shape[0])
+            x2, y2 = int(end.x * pose_image.shape[1]), int(end.y * pose_image.shape[0])
+            cv2.line(pose_image, (x1, y1), (x2, y2), (255, 255, 255), 2)
+    pose_pil = Image.fromarray(cv2.resize(pose_image, (gen_width, gen_height), interpolation=cv2.INTER_LANCZOS4))
+    base64_image = base64.b64encode(cv2.imencode('.jpg', image_rgb)[1]).decode()
+    api_key = "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"
+    headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
+    payload = {
+        "model": "gpt-4o-mini",
+        "messages": [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "Based on the provided image, think of one spirit animal that is right for the person, and answer in the following format: An ultra-realistic, highly detailed photograph of a single {animal} with facial features characterized by {description}, standing upright in a human-like pose, looking directly at the camera, against a solid, neutral background. Generate one sentence without any other responses or numbering."},
+                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}
+                ]
+            }
+        ],
+        "max_tokens": 100
+    }
+    response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
+    prompt = response.json()['choices'][0]['message']['content'] if 'choices' in response.json() else "A majestic animal"
+    mask_image = remove(Image.fromarray(image_rgb))
+    initial_mask = mask_image.split()[-1].convert('L')
+    kernel_size = min(gen_width, gen_height) // 15
+    expanded_mask = expand_mask(initial_mask, kernel_size)
+    num_images = num_images
+    generated_images = []
+    with torch.no_grad():
+        with torch.autocast(device_type=device.type):
+            for _ in range(num_images):
+                images = pipe_inpaint_controlnet(
+                    prompt=prompt,
+                    negative_prompt="multiple heads, extra limbs, duplicate faces, mutated anatomy, disfigured, blurry",
+                    num_inference_steps=20,
+                    image=Image.fromarray(image_rgb),
+                    mask_image=expanded_mask,
+                    control_image=pose_pil,
+                    width=gen_width,
+                    height=gen_height,
+                    guidance_scale=5,
+                ).images
+                generated_images.append(images[0])
+    return prompt, generated_images
+def generate_multiple_animals(image_path, keep_background=True, num_images = 4):
+    image = cv2.imread(image_path)
+    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+    image_rgb = crop_face_to_square(image_rgb)
+    original_image = Image.fromarray(image_rgb)
+    original_width, original_height = original_image.size
+    aspect_ratio = original_width / original_height
+    if aspect_ratio > 1:
+        gen_width = 768
+        gen_height = int(gen_width / aspect_ratio)
+    else:
+        gen_height = 768
+        gen_width = int(gen_height * aspect_ratio)
+    gen_width, gen_height = resize_to_multiple_of_64(gen_width, gen_height)
+    base64_image = base64.b64encode(cv2.imencode('.jpg', image_rgb)[1]).decode()
+    api_key = "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"
+    headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
+    payload = {
+        "model": "gpt-4o-mini",
+        "messages": [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": "Based on the provided image, think of " + str(num_images) + " different spirit animals that are right for the person, and answer in the following format for each: An ultra-realistic, highly detailed photograph of a {animal} with facial features characterized by {description}, standing upright in a human-like pose, looking directly at the camera, against a solid, neutral background. Generate these sentences without any other responses or numbering. For the animal choose between owl, bear, fox, koala, lion, dog"
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}
+                    }
+                ]
+            }
+        ],
+        "max_tokens": 500
+    }
+    response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
+    response_json = response.json()
+    if 'choices' in response_json and len(response_json['choices']) > 0:
+      content = response_json['choices'][0]['message']['content']
+      prompts = [prompt.strip() for prompt in content.strip().split('.') if prompt.strip()]
+      negative_prompt = (
+          "multiple heads, extra limbs, duplicate faces, mutated anatomy, disfigured, "
+          "blurry, deformed, text, watermark, logo, low resolution"
+      )
+      formatted_prompts = "\n".join(f"{i+1}. {prompt}" for i, prompt in enumerate(prompts))
+    with mp_pose.Pose(static_image_mode=True) as pose:
+        results = pose.process(image_rgb)
+        if results.pose_landmarks:
+            annotated_image = image_rgb.copy()
+            mp_drawing.draw_landmarks(
+                annotated_image, results.pose_landmarks, mp_pose.POSE_CONNECTIONS
+            )
+        else:
+            print("No pose detected.")
+            return "No pose detected.", []
+    pose_image = np.zeros_like(image_rgb)
+    for connection in mp_pose.POSE_CONNECTIONS:
+        start_idx, end_idx = connection
+        start, end = results.pose_landmarks.landmark[start_idx], results.pose_landmarks.landmark[end_idx]
+        if start.visibility > 0.5 and end.visibility > 0.5:
+            x1, y1 = int(start.x * pose_image.shape[1]), int(start.y * pose_image.shape[0])
+            x2, y2 = int(end.x * pose_image.shape[1]), int(end.y * pose_image.shape[0])
+            cv2.line(pose_image, (x1, y1), (x2, y2), (255, 255, 255), 2)
+    pose_pil = Image.fromarray(cv2.resize(pose_image, (gen_width, gen_height), interpolation=cv2.INTER_LANCZOS4))
+    if keep_background:
+        mask_image = remove(original_image)
+        initial_mask = mask_image.split()[-1].convert('L')
+        expanded_mask = expand_mask(initial_mask, kernel_size=min(gen_width, gen_height) // 15)
+    else:
+        expanded_mask = None
+    generated_images = []
+    if keep_background:
+        with torch.no_grad():
+            with torch.amp.autocast("cuda"):
+                for prompt in prompts:
+                    images = pipe_inpaint_controlnet(
+                        prompt=prompt,
+                        negative_prompt=negative_prompt,
+                        num_inference_steps=20,
+                        image=Image.fromarray(image_rgb),
+                        mask_image=expanded_mask,
+                        control_image=pose_pil,
+                        width=gen_width,
+                        height=gen_height,
+                        guidance_scale=5,
+                    ).images
+                    generated_images.append(images[0])
+    else:
+        with torch.no_grad():
+            with torch.amp.autocast("cuda"):
+                for prompt in prompts:
+                    images = pipe_controlnet(
+                        prompt=prompt,
+                        negative_prompt=negative_prompt,
+                        num_inference_steps=20,
+                        image=pose_pil,
+                        guidance_scale=5,
+                        width=gen_width,
+                        height=gen_height,
+                    ).images
+                    generated_images.append(images[0])
+    return formatted_prompts, generated_images
+def wait_for_file(file_path, timeout=500):
+    """
+    Wait for a file to be created, with a specified timeout.
+    Args:
+        file_path (str): The path of the file to wait for.
+        timeout (int): Maximum time to wait in seconds.
+    Returns:
+        bool: True if the file is created, False if timeout occurs.
+    """
+    start_time = time.time()
+    while not os.path.exists(file_path):
+        if time.time() - start_time > timeout:
+            return False
+        time.sleep(0.5)  # Check every 0.5 seconds
+    return True
+def generate_spirit_animal_video(driving_video_path):
+    os.chdir(".")
+    try:
+        # Step 1: Extract the first frame
+        cap = cv2.VideoCapture(driving_video_path)
+        if not cap.isOpened():
+            print("Error: Unable to open video.")
+            return None
+        ret, frame = cap.read()
+        cap.release()
+        if not ret:
+            print("Error: Unable to read the first frame.")
+            return None
+        # Save the first frame
+        first_frame_path = "./first_frame.jpg"
+        cv2.imwrite(first_frame_path, frame)
+        print(f"First frame saved to: {first_frame_path}")
+        # Generate spirit animal image
+        _, input_image = generate_multiple_animals(first_frame_path, True, 1)
+        if input_image is None or not input_image:
+            print("Error: Spirit animal generation failed.")
+            return None
+        spirit_animal_path = "./animal.jpeg"
+        cv2.imwrite(spirit_animal_path, cv2.cvtColor(np.array(input_image[0]), cv2.COLOR_RGB2BGR))
+        print(f"Spirit animal image saved to: {spirit_animal_path}")
+        # Step 3: Run inference
+        output_path = "./animations/animal--uploaded_video_compressed.mp4"
+        script_path = os.path.abspath("../LivePortrait/inference_animals.py")
+        if not os.path.exists(script_path):
+            print(f"Error: Inference script not found at {script_path}.")
+            return None
+        command = f"python {script_path} -s {spirit_animal_path} -d {driving_video_path} --driving_multiplier 1.75 --no_flag_stitching"
+        print(f"Running command: {command}")
+        result = os.system(command)
+        if result != 0:
+            print(f"Error: Command failed with exit code {result}.")
+            return None
+        # Verify output file exists
+        if not os.path.exists(output_path):
+            print(f"Error: Expected output video not found at {output_path}.")
+            return None
+        print(f"Output video generated at: {output_path}")
+        return output_path
+    except Exception as e:
+        print(f"Error occurred: {e}")
+        return None
+def generate_spirit_animal(image, animal_type, background):
+    if animal_type == "Single Animal":
+        if background == "Preserve Background":
+            prompt, generated_images = spirit_animal_with_background(image)
+        else:
+            prompt, generated_images = spirit_animal_baseline(image)
+    elif animal_type == "Multiple Animals":
+        if background == "Preserve Background":
+            prompt, generated_images = generate_multiple_animals(image, keep_background=True)
+        else:
+            prompt, generated_images = generate_multiple_animals(image, keep_background=False)
+    return prompt, generated_images
+def compress_video(input_path, output_path, target_size_mb):
+    target_size_bytes = target_size_mb * 1024 * 1024
+    temp_output = "./temp_compressed.mp4"
+    cap = cv2.VideoCapture(input_path)
+    fourcc = cv2.VideoWriter_fourcc(*'mp4v')  # 使用 mp4 编码
+    fps = int(cap.get(cv2.CAP_PROP_FPS))
+    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+    writer = cv2.VideoWriter(temp_output, fourcc, fps, (width, height))
+    while cap.isOpened():
+        ret, frame = cap.read()
+        if not ret:
+            break
+        writer.write(frame)
+    cap.release()
+    writer.release()
+    current_size = os.path.getsize(temp_output)
+    if current_size > target_size_bytes:
+        bitrate = int(target_size_bytes * 8 / (current_size / target_size_bytes))  # 按比例缩减比特率
+        os.system(f"ffmpeg -i {temp_output} -b:v {bitrate} -y {output_path}")
+        os.remove(temp_output)
+    else:
+        shutil.move(temp_output, output_path)
+def process_video(video_file):
+    # 初始化 LivePortrait
+    try:
+        download_liveportrait()
+    except Exception as e:
+        print("Failed to initialize LivePortrait:", e)
+        return gr.update(value=None, visible=False)
+        # 下载 Hugging Face 资源
+    try:
+        download_huggingface_resources()
+    except Exception as e:
+        print("Failed to download Hugging Face resources:", e)
+        return gr.update(value=None, visible=False)
+    compressed_path = "./uploaded_video_compressed.mp4"
+    compress_video(video_file, compressed_path, target_size_mb=1)
+    print(f"Compressed and moved video to: {compressed_path}")
+    output_video_path = generate_spirit_animal_video(compressed_path)
+    # Wait until the output video is generated
+    timeout = 6000  # Timeout in seconds
+    if not wait_for_file(output_video_path, timeout=timeout):
+        print("Timeout occurred while waiting for video generation.")
+        return gr.update(value=None, visible=False)  # Hide output if failed
+    # Return the generated video path
+    print(f"Output video is ready: {output_video_path}")
+    return gr.update(value=output_video_path, visible=True)  # Show video
+# Custom CSS styling for the interface.
+# Defines rules for three element ids: #title-container (and its h1/h2),
+# #intro-text and #prompt-output.
+css = """
+#title-container {
+    font-family: 'Arial', sans-serif;
+    color: #4a4a4a;
+    text-align: center;
+    margin-bottom: 20px;
+}
+#title-container h1 {
+    font-size: 2.5em;
+    font-weight: bold;
+    color: #ff9900;
+}
+#title-container h2 {
+    font-size: 1.2em;
+    color: #6c757d;
+}
+#intro-text {
+    font-size: 1em;
+    color: #6c757d;
+    margin: 50px;
+    text-align: center;
+    font-style: italic;
+}
+#prompt-output {
+    font-family: 'Courier New', monospace;
+    color: #5a5a5a;
+    font-size: 1.1em;
+    padding: 10px;
+    background-color: #f9f9f9;
+    border: 1px solid #ddd;
+    border-radius: 5px;
+    margin-top: 10px;
+}
+"""
+# Title banner: raw HTML wrapped in a div with id "title-container",
+# holding the page heading (h1) and tagline (h2).
+title_html = """
+<div id="title-container">
+    <h1>Spirit Animal Generator</h1>
+    <h2>Create your unique spirit animal with AI-assisted image generation.</h2>
+</div>
+"""
+# Markdown shown under the title: project overview, key features and
+# step-by-step usage instructions.
+description_text = """
+### Project Overview
+Welcome to the Spirit Animal Generator! This tool leverages advanced AI technologies to create unique visualizations of spirit animals from both videos and images.
+#### Key Features:
+1. **Video Transformation**: Upload a driving video to generate a creative spirit animal animation.
+2. **Image Creation**: Upload an image and customize the spirit animal type and background options.
+3. **AI-Powered Prompting**: OpenAI's GPT generates descriptive prompts for each input.
+4. **High-Quality Outputs**: Generated using Stable Diffusion and ControlNet for stunning visuals.
+---
+### How It Works:
+1. **Upload Your Media**:
+   - Videos: Ensure the file is in MP4 format.
+   - Images: Use clear, high-resolution photos for better results.
+2. **Customize Options**:
+   - For images, select the type of animal and background settings.
+3. **View Your Results**:
+   - Videos will be transformed into animations.
+   - Images will produce customized visual art along with a generated prompt.
+Discover your spirit animal and let your imagination run wild!
+---
+"""
+with gr.Blocks() as demo:
+    gr.HTML(title_html)
+    gr.Markdown(description_text)
+    with gr.Tabs():
+        with gr.Tab("Generate Spirit Animal Image"):
+            gr.Markdown("Upload an image to generate a spirit animal.")
+            with gr.Row():
+                with gr.Column(scale=1):
+                    image_input = gr.Image(type="filepath", label="Upload an image")
+                    animal_type = gr.Radio(choices=["Single Animal", "Multiple Animals"], label="Animal Type", value="Single Animal")
+                    background_option = gr.Radio(choices=["Preserve Background", "Don't Preserve Background"], label="Background Option", value="Preserve Background")
+                    generate_image_button = gr.Button("Generate Image")
+                with gr.Column(scale=1):
+                    generated_prompt = gr.Textbox(label="Generated Prompt")
+                    generated_gallery = gr.Gallery(label="Generated Images")
+            generate_image_button.click(
+                fn=generate_spirit_animal,
+                inputs=[image_input, animal_type, background_option],
+                outputs=[generated_prompt, generated_gallery],
+            )
+        with gr.Tab("Generate Spirit Animal Video"):
+            gr.Markdown("Upload a driving video to generate a spirit animal video.")
+            with gr.Row():
+                with gr.Column(scale=1):
+                    video_input = gr.Video(label="Upload a driving video (MP4 format)")
+                    generate_video_button = gr.Button("Generate Video")
+                with gr.Column(scale=1):
+                    video_output = gr.Video(label="Generated Spirit Animal Video")
+            generate_video_button.click(
+                fn=process_video,
+                inputs=video_input,
+                outputs=video_output,
+            )
+demo.launch()