{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import json\n",
"import torch\n",
"import numpy as np\n",
"import PIL\n",
"from PIL import Image\n",
"from IPython.display import HTML\n",
"from pyramid_dit import PyramidDiTForVideoGeneration\n",
"from IPython.display import Image as ipython_image\n",
"from diffusers.utils import load_image, export_to_video, export_to_gif"
]
},
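{
 "attachments": {},
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "Optional environment check before loading the model. This is only a sketch using standard PyTorch calls; the repo does not mandate a specific GPU, so treat the reported VRAM figure as informational."
 ]
},
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "# Optional sanity check (assumes a single-GPU setup).\n",
  "print(\"torch:\", torch.__version__)\n",
  "print(\"CUDA available:\", torch.cuda.is_available())\n",
  "if torch.cuda.is_available():\n",
  "    props = torch.cuda.get_device_properties(0)\n",
  "    print(f\"{props.name}: {props.total_memory / 1024**3:.1f} GiB\")"
 ]
},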
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# variant='diffusion_transformer_384p' # For low resolution variant\n",
"variant='diffusion_transformer_768p' # For high resolution variant\n",
"\n",
"model_name = \"pyramid_flux\" # select the model \"pyramid_flux\" or \"pyramid_mmdit\"\n",
"\n",
"model_path = \"/home/jinyang06/models/pyramid-flow-miniflux\" # The downloaded checkpoint dir\n",
"model_dtype = 'bf16'\n",
"\n",
"device_id = 0\n",
"torch.cuda.set_device(device_id)\n",
"\n",
"model = PyramidDiTForVideoGeneration(\n",
" model_path,\n",
" model_dtype,\n",
" model_name=model_name,\n",
" model_variant=variant,\n",
")\n",
"\n",
"model.vae.to(\"cuda\")\n",
"model.dit.to(\"cuda\")\n",
"model.text_encoder.to(\"cuda\")\n",
"\n",
"model.vae.enable_tiling()\n",
"\n",
"if model_dtype == \"bf16\":\n",
" torch_dtype = torch.bfloat16 \n",
"elif model_dtype == \"fp16\":\n",
" torch_dtype = torch.float16\n",
"else:\n",
" torch_dtype = torch.float32\n",
"\n",
"\n",
"def resize_crop_image(img: PIL.Image.Image, tgt_width, tgt_height):\n",
" ori_width, ori_height = img.width, img.height\n",
" scale = max(tgt_width / ori_width, tgt_height / ori_height)\n",
" resized_width = round(ori_width * scale)\n",
" resized_height = round(ori_height * scale)\n",
" img = img.resize((resized_width, resized_height), resample=PIL.Image.LANCZOS)\n",
"\n",
" left = (resized_width - tgt_width) / 2\n",
" top = (resized_height - tgt_height) / 2\n",
" right = (resized_width + tgt_width) / 2\n",
" bottom = (resized_height + tgt_height) / 2\n",
"\n",
" # Crop the center of the image\n",
" img = img.crop((left, top, right, bottom))\n",
" \n",
" return img\n",
"\n",
"\n",
"def show_video(ori_path, rec_path, width=\"100%\"):\n",
" html = ''\n",
" if ori_path is not None:\n",
" html += f\"\"\"\n",
" \"\"\"\n",
" \n",
" html += f\"\"\"\n",
" \"\"\"\n",
" return HTML(html)"
]
},
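{
 "attachments": {},
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "Optionally seed the global RNGs so runs are repeatable. A minimal sketch using only standard `random`/NumPy/PyTorch calls; that `model.generate` draws all of its noise from torch's global generator is an assumption about the Pyramid Flow internals."
 ]
},
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "import random\n",
  "\n",
  "def seed_everything(seed: int = 42):\n",
  "    # Seed Python, NumPy, and all torch (CPU + CUDA) RNGs in one place.\n",
  "    # Assumption: sampling below goes through torch's global RNG.\n",
  "    random.seed(seed)\n",
  "    np.random.seed(seed)\n",
  "    torch.manual_seed(seed)\n",
  "    torch.cuda.manual_seed_all(seed)\n",
  "\n",
  "seed_everything(42)"
 ]
},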
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Text-to-Video"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"prompt = \"A movie trailer featuring the adventures of the 30 year old space man wearing a red wool knitted motorcycle helmet, blue sky, salt desert, cinematic style, shot on 35mm film, vivid colors\"\n",
"\n",
"# used for 384p model variant\n",
"# width = 640\n",
"# height = 384\n",
"\n",
"# used for 768p model variant\n",
"width = 1280\n",
"height = 768\n",
"\n",
"temp = 16 # temp in [1, 31] <=> frame in [1, 241] <=> duration in [0, 10s]\n",
"# Noting that, for the 384p version, only supports maximum 5s generation (temp = 16)\n",
"\n",
"with torch.no_grad(), torch.cuda.amp.autocast(enabled=True if model_dtype != 'fp32' else False, dtype=torch_dtype):\n",
" frames = model.generate(\n",
" prompt=prompt,\n",
" num_inference_steps=[20, 20, 20],\n",
" video_num_inference_steps=[10, 10, 10],\n",
" height=height,\n",
" width=width,\n",
" temp=temp,\n",
" guidance_scale=7.0, # The guidance for the first frame, set it to 7 for 384p variant\n",
" video_guidance_scale=5.0, # The guidance for the other video latent\n",
" output_type=\"pil\",\n",
" save_memory=True, # If you have enough GPU memory, set it to `False` to improve vae decoding speed\n",
" )\n",
"\n",
"export_to_video(frames, \"./text_to_video_sample.mp4\", fps=24)\n",
"show_video(None, \"./text_to_video_sample.mp4\", \"70%\")"
]
},
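{
 "attachments": {},
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "The imports above also bring in `export_to_gif` from `diffusers.utils`; it can turn the same `frames` list into a shareable preview. Downscaling each frame by 2x first is an assumption made here purely to keep the GIF small at 768p, not something the API requires."
 ]
},
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "# Optional GIF preview of the frames generated above.\n",
  "# The 2x downscale is a size/quality trade-off, not required by the API.\n",
  "preview = [f.resize((f.width // 2, f.height // 2)) for f in frames]\n",
  "export_to_gif(preview, \"./text_to_video_sample.gif\")\n",
  "ipython_image(\"./text_to_video_sample.gif\", width=480)"
 ]
},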
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Image-to-Video"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"image_path = 'assets/the_great_wall.jpg'\n",
"image = Image.open(image_path).convert(\"RGB\")\n",
"\n",
"# used for 384p model variant\n",
"# width = 640\n",
"# height = 384\n",
"\n",
"# used for 768p model variant\n",
"width = 1280\n",
"height = 768\n",
"\n",
"temp = 16\n",
"image = image.resize((width, height))\n",
"image = resize_crop_image(image, width, height)\n",
"\n",
"display(image)\n",
"\n",
"prompt = \"FPV flying over the Great Wall\"\n",
"\n",
"with torch.no_grad(), torch.cuda.amp.autocast(enabled=True if model_dtype != 'fp32' else False, dtype=torch_dtype):\n",
" frames = model.generate_i2v(\n",
" prompt=prompt,\n",
" input_image=image,\n",
" num_inference_steps=[10, 10, 10],\n",
" temp=temp,\n",
" guidance_scale=7.0,\n",
" video_guidance_scale=4.0,\n",
" output_type=\"pil\",\n",
" save_memory=True, # If you have enough GPU memory, set it to `False` to improve vae decoding speed\n",
" )\n",
"\n",
"export_to_video(frames, \"./image_to_video_sample.mp4\", fps=24)\n",
"show_video(None, \"./image_to_video_sample.mp4\", \"70%\")"
]
}
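,
{
 "attachments": {},
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "A quick visual check on how faithfully the first generated frame matches the conditioning image. A pure-PIL sketch; that `frames[0]` corresponds to the decoded first frame of the clip is an assumption about `generate_i2v`'s output layout."
 ]
},
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "# Side-by-side: conditioning image (left) vs. first generated frame (right).\n",
  "# Assumption: frames[0] is the decoded first frame of the generated clip.\n",
  "comparison = Image.new(\"RGB\", (image.width * 2, image.height))\n",
  "comparison.paste(image, (0, 0))\n",
  "comparison.paste(frames[0].resize(image.size), (image.width, 0))\n",
  "display(comparison.resize((comparison.width // 2, comparison.height // 2)))"
 ]
}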
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}