# Spaces: prs-eth/rollingdepth
# Running on Zero
# File size: 9,211 Bytes

# Copyright 2024 Anton Obukhov, ETH Zurich. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# --------------------------------------------------------------------------
# If you find this code useful, we kindly ask you to cite our paper in your work.
# Please find bibtex at: https://github.com/prs-eth/Marigold#-citation
# More information about the method can be found at https://marigoldmonodepth.github.io
# --------------------------------------------------------------------------

import functools
import os
import sys
import tempfile

import av
import numpy as np

import spaces
import gradio as gr
import torch as torch
import einops

from huggingface_hub import login

from colorize import colorize_depth_multi_thread
from video_io import get_video_fps, write_video_from_numpy

# Forwarded as `verbose=` to the pipeline call, the depth colorization and the
# video writers in process().
VERBOSE = False
# Upper bound on the number of frames of an input video that process() asks the
# pipeline to handle; longer videos trigger a UI warning.
MAX_FRAMES = 100


def process(pipe, device, path_input):
    """Run the depth pipeline on one video and write two preview videos.

    Args:
        pipe: Callable pipeline. It is invoked with the keyword arguments
            listed below and must return an object exposing ``depth_pred``
            and ``input_rgb`` (both converted with ``.numpy()`` here).
        device: Device on which the seeded ``torch.Generator`` is created.
        path_input: Path to the input video file.

    Returns:
        Tuple ``(path_out_in, path_out_vis)``: the path of the re-encoded
        input frames and the path of the colorized depth video, both inside
        a freshly created temporary directory.

    Raises:
        gr.Error: If no input video was provided.
    """
    if not path_input:
        # Clicking "Generate" without a video passes None; report it in the UI
        # instead of failing later with an opaque TypeError.
        raise gr.Error("Please upload a video or select one of the examples.")

    print(f"Processing {path_input}")

    # mkdtemp() already creates the directory, no extra makedirs needed.
    path_output_dir = tempfile.mkdtemp()

    name_base = os.path.splitext(os.path.basename(path_input))[0]
    path_out_in = os.path.join(path_output_dir, f"{name_base}_depth_input.mp4")
    path_out_vis = os.path.join(path_output_dir, f"{name_base}_depth_colored.mp4")

    output_fps = int(get_video_fps(path_input))

    # Probe the container only for metadata and close it right away so the
    # file handle is not leaked.
    with av.open(path_input) as container:
        stream = container.streams.video[0]
        fps = float(stream.average_rate) if stream.average_rate else 0.0
        duration_sec = (
            float(stream.duration * stream.time_base) if stream.duration else 0
        )
    total_frames = int(duration_sec * fps)  # estimate; 0 means "unknown"

    if total_frames > MAX_FRAMES:
        gr.Warning(
            f"Only the first {MAX_FRAMES} frames (~{MAX_FRAMES / fps:.1f} sec.) will be processed for demonstration; "
            f"use the code from GitHub for full processing"
        )

    if total_frames > 0:
        frame_count = min(MAX_FRAMES, total_frames)
    else:
        # The pipeline treats frame_count=0 as "all frames", so an unknown
        # length must not be forwarded as 0 or the demo cap is bypassed.
        # NOTE(review): assumes the pipeline stops at end-of-video when fewer
        # than frame_count frames exist -- confirm against rollingdepth.
        frame_count = MAX_FRAMES

    generator = torch.Generator(device=device)
    generator.manual_seed(2024)

    # The result is a RollingDepthOutput; that class is only imported inside
    # main(), so it is not used as an annotation here.
    pipe_out = pipe(
        # input setting
        input_video_path=path_input,
        start_frame=0,
        frame_count=frame_count,  # 0 = all
        processing_res=768,
        # infer setting
        dilations=[1, 25],
        cap_dilation=True,
        snippet_lengths=[3],
        init_infer_steps=[1],
        strides=[1],
        coalign_kwargs=None,
        refine_step=0,  # 0 = off
        max_vae_bs=8,  # batch size for encoder/decoder
        # other settings
        generator=generator,
        verbose=VERBOSE,
        # output settings
        restore_res=False,
        unload_snippet=False,
    )

    depth_pred = pipe_out.depth_pred  # [N 1 H W]

    # Colorize results
    cmap = "Spectral_r"
    colored_np = colorize_depth_multi_thread(
        depth=depth_pred.numpy(),
        valid_mask=None,
        chunk_size=4,
        num_threads=4,
        color_map=cmap,
        verbose=VERBOSE,
    )  # [n h w 3], in [0, 255]

    write_video_from_numpy(
        frames=colored_np,
        output_path=path_out_vis,
        fps=output_fps,
        crf=23,
        preset="medium",
        verbose=VERBOSE,
    )

    # Save rgb
    rgb = (pipe_out.input_rgb.numpy() * 255).astype(np.uint8)  # [N 3 H W]
    rgb = einops.rearrange(rgb, "n c h w -> n h w c")
    write_video_from_numpy(
        frames=rgb,
        output_path=path_out_in,
        fps=output_fps,
        crf=23,
        preset="medium",
        verbose=VERBOSE,
    )

    return path_out_in, path_out_vis


def run_demo_server(pipe, device):
    """Build the Gradio interface around ``process`` and launch the server.

    ``pipe`` and ``device`` are bound to ``process`` up front, so the UI
    callback only receives the input video path. The bound callable is wrapped
    with ``spaces.GPU`` (duration=120) and serves both the "Generate" button
    and the cached examples. The call to ``launch`` happens inside this
    function, on 0.0.0.0:7860.
    """
    bound_process = functools.partial(process, pipe, device)
    process_pipe = spaces.GPU(bound_process, duration=120)
    os.environ["GRADIO_ALLOW_FLAGGING"] = "never"

    # Center all heading levels used on the page.
    page_css = """
            h1 {
                text-align: center;
                display: block;
            }
            h2 {
                text-align: center;
                display: block;
            }
            h3 {
                text-align: center;
                display: block;
            }
        """

    # Title, badge links and the short usage description.
    header_html = """
            <h1>🛹 RollingDepth 🛹: Video Depth without Video Models</h1>
            <div style="text-align: center; margin-top: 20px;">
                <a title="Website" href="https://rollingdepth.github.io" target="_blank" rel="noopener noreferrer" style="display: inline-block; margin-right: 4px;">
                    <img src="https://www.obukhov.ai/img/badges/badge-website.svg" alt="Website Badge">
                </a>
                <a title="arXiv" href="https://arxiv.org/abs/2411.xxxxx" target="_blank" rel="noopener noreferrer" style="display: inline-block; margin-right: 4px;">
                    <img src="https://www.obukhov.ai/img/badges/badge-pdf.svg" alt="arXiv Badge">
                </a>
                <a title="GitHub" href="https://github.com/prs-eth/rollingdepth" target="_blank" rel="noopener noreferrer" style="display: inline-block; margin-right: 4px;">
                    <img src="https://img.shields.io/github/stars/prs-eth/rollingdepth?label=GitHub%20%E2%98%85&logo=github&color=C8C" alt="GitHub Stars Badge">
                </a>
                <a title="Social" href="https://twitter.com/antonobukhov1" target="_blank" rel="noopener noreferrer" style="display: inline-block; margin-right: 4px;">
                    <img src="https://www.obukhov.ai/img/badges/badge-social.svg" alt="social">
                </a>
            </div>
            <p style="margin-top: 20px; text-align: justify;">
                RollingDepth is the state-of-the-art depth estimator for videos in the wild. Upload your video into the 
                <b>left</b> pane, or click any of the <b>examples</b> below. The result preview will be computed and 
                appear in the <b>right</b> panes. For full functionality, use the code on GitHub. 
                <b>TIP:</b> When running out of GPU time, fork the demo.
            </p>
            """

    # Settings shared by both read-only result players.
    result_player_kwargs = dict(
        interactive=False,
        autoplay=True,
        loop=True,
        show_share_button=True,
        scale=5,
    )

    example_videos = [
        ["files/gokart.mp4"],
        ["files/horse.mp4"],
        ["files/walking.mp4"],
    ]

    with gr.Blocks(analytics_enabled=False, title="RollingDepth", css=page_css) as demo:
        gr.HTML(header_html)

        # Top row: input on the left, the two result players on the right.
        with gr.Row(equal_height=True):
            with gr.Column(scale=1):
                input_video = gr.Video(label="Input Video")
            with gr.Column(scale=2):
                with gr.Row(equal_height=True):
                    output_video_1 = gr.Video(
                        label="Preprocessed video", **result_player_kwargs
                    )
                    output_video_2 = gr.Video(
                        label="Generated Depth Video", **result_player_kwargs
                    )

        # Second row: the button under the input; the right column stays empty.
        with gr.Row(equal_height=True):
            with gr.Column(scale=1):
                with gr.Row(equal_height=False):
                    generate_btn = gr.Button("Generate")
            with gr.Column(scale=2):
                pass

        result_players = [output_video_1, output_video_2]

        gr.Examples(
            examples=example_videos,
            inputs=[input_video],
            outputs=result_players,
            fn=process_pipe,
            cache_examples=True,
        )

        generate_btn.click(
            fn=process_pipe,
            inputs=[input_video],
            outputs=result_players,
        )

        queued_demo = demo.queue(api_open=False)
        queued_demo.launch(server_name="0.0.0.0", server_port=7860)


def main():
    """Prepare the environment, load the RollingDepth pipeline, start the demo.

    Steps, in order:
      1. Replace the installed ``diffusers`` package with the copy located at
         ``rollingdepth_src/diffusers`` (relative to the working directory),
         printing ``pip freeze`` before and after.
      2. Log in to the Hugging Face Hub if ``HF_TOKEN_LOGIN`` is set.
      3. Pick the device: CUDA, then MPS, then CPU.
      4. Load ``prs-eth/rollingdepth-v1-0`` in float16, optionally enable
         xformers attention, move the pipeline to the device.
      5. Hand over to ``run_demo_server``.
    """
    setup_commands = (
        "pip freeze",
        "pip uninstall -y diffusers",
        "pip install rollingdepth_src/diffusers",
        "pip freeze",
    )
    for command in setup_commands:
        status = os.system(command)
        if status != 0:
            # Stay best-effort as before, but do not hide a failed step: a
            # broken diffusers install otherwise only surfaces much later.
            print(
                f"Warning: `{command}` exited with status {status}",
                file=sys.stderr,
            )

    if "HF_TOKEN_LOGIN" in os.environ:
        login(token=os.environ["HF_TOKEN_LOGIN"])

    if torch.cuda.is_available():
        device = torch.device("cuda")
    elif torch.backends.mps.is_available():
        device = torch.device("mps")
    else:
        device = torch.device("cpu")

    # Imported here, after the pip step above and after extending sys.path,
    # because the package lives in the bundled rollingdepth_src directory.
    sys.path.append(os.path.join(os.path.dirname(__file__), "rollingdepth_src"))
    from rollingdepth import RollingDepthOutput, RollingDepthPipeline

    pipe: RollingDepthPipeline = RollingDepthPipeline.from_pretrained(
        "prs-eth/rollingdepth-v1-0",
        torch_dtype=torch.float16,
    )
    pipe.set_progress_bar_config(disable=True)

    try:
        # The import only probes whether xformers is installed.
        import xformers  # noqa: F401

        pipe.enable_xformers_memory_efficient_attention()
    except Exception as exc:
        # Optional acceleration: run without xformers. `Exception` instead of
        # a bare except so that Ctrl-C / SystemExit are not swallowed.
        print(f"Running without xformers: {exc}")

    pipe = pipe.to(device)

    run_demo_server(pipe, device)


# Start the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()