{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import json\n",
"import torch\n",
"import numpy as np\n",
"import PIL\n",
"from PIL import Image\n",
"from IPython.display import HTML\n",
"from pyramid_dit import PyramidDiTForVideoGeneration\n",
"from IPython.display import Image as ipython_image\n",
"from diffusers.utils import load_image, export_to_video, export_to_gif"
]
},
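{
 "attachments": {},
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "Optional environment check before loading the model. This is only a sketch using standard PyTorch calls; the repo does not mandate a specific GPU, so treat the reported VRAM figure as informational."
 ]
},
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "# Optional sanity check (assumes a single-GPU setup).\n",
  "print(\"torch:\", torch.__version__)\n",
  "print(\"CUDA available:\", torch.cuda.is_available())\n",
  "if torch.cuda.is_available():\n",
  "    props = torch.cuda.get_device_properties(0)\n",
  "    print(f\"{props.name}: {props.total_memory / 1024**3:.1f} GiB\")"
 ]
},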
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# variant='diffusion_transformer_384p' # For low resolution variant\n",
"variant='diffusion_transformer_768p' # For high resolution variant\n",
"\n",
"model_name = \"pyramid_flux\" # select the model \"pyramid_flux\" or \"pyramid_mmdit\"\n",
"\n",
"model_path = \"/home/jinyang06/models/pyramid-flow-miniflux\" # The downloaded checkpoint dir\n",
"model_dtype = 'bf16'\n",
"\n",
"device_id = 0\n",
"torch.cuda.set_device(device_id)\n",
"\n",
"model = PyramidDiTForVideoGeneration(\n",
" model_path,\n",
" model_dtype,\n",
" model_name=model_name,\n",
" model_variant=variant,\n",
")\n",
"\n",
"model.vae.to(\"cuda\")\n",
"model.dit.to(\"cuda\")\n",
"model.text_encoder.to(\"cuda\")\n",
"\n",
"model.vae.enable_tiling()\n",
"\n",
"if model_dtype == \"bf16\":\n",
" torch_dtype = torch.bfloat16 \n",
"elif model_dtype == \"fp16\":\n",
" torch_dtype = torch.float16\n",
"else:\n",
" torch_dtype = torch.float32\n",
"\n",
"\n",
"def resize_crop_image(img: PIL.Image.Image, tgt_width, tgt_height):\n",
" ori_width, ori_height = img.width, img.height\n",
" scale = max(tgt_width / ori_width, tgt_height / ori_height)\n",
" resized_width = round(ori_width * scale)\n",
" resized_height = round(ori_height * scale)\n",
" img = img.resize((resized_width, resized_height), resample=PIL.Image.LANCZOS)\n",
"\n",
" left = (resized_width - tgt_width) / 2\n",
" top = (resized_height - tgt_height) / 2\n",
" right = (resized_width + tgt_width) / 2\n",
" bottom = (resized_height + tgt_height) / 2\n",
"\n",
" # Crop the center of the image\n",
" img = img.crop((left, top, right, bottom))\n",
" \n",
" return img\n",
"\n",
"\n",
"def show_video(ori_path, rec_path, width=\"100%\"):\n",
" html = ''\n",
" if ori_path is not None:\n",
" html += f\"\"\"\n",
" \"\"\"\n",
" \n",
" html += f\"\"\"\n",
" \"\"\"\n",
" return HTML(html)"
]
},
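{
 "attachments": {},
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "Optionally seed the global RNGs so runs are repeatable. A minimal sketch using only standard `random`/NumPy/PyTorch calls; that `model.generate` draws all of its noise from torch's global generator is an assumption about the Pyramid Flow internals."
 ]
},
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "import random\n",
  "\n",
  "def seed_everything(seed: int = 42):\n",
  "    # Seed Python, NumPy, and all torch (CPU + CUDA) RNGs in one place.\n",
  "    # Assumption: sampling below goes through torch's global RNG.\n",
  "    random.seed(seed)\n",
  "    np.random.seed(seed)\n",
  "    torch.manual_seed(seed)\n",
  "    torch.cuda.manual_seed_all(seed)\n",
  "\n",
  "seed_everything(42)"
 ]
},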
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Text-to-Video"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"prompt = \"A movie trailer featuring the adventures of the 30 year old space man wearing a red wool knitted motorcycle helmet, blue sky, salt desert, cinematic style, shot on 35mm film, vivid colors\"\n",
"\n",
"# used for 384p model variant\n",
"# width = 640\n",
"# height = 384\n",
"\n",
"# used for 768p model variant\n",
"width = 1280\n",
"height = 768\n",
"\n",
"temp = 16 # temp in [1, 31] <=> frame in [1, 241] <=> duration in [0, 10s]\n",
"# Noting that, for the 384p version, only supports maximum 5s generation (temp = 16)\n",
"\n",
"with torch.no_grad(), torch.cuda.amp.autocast(enabled=True if model_dtype != 'fp32' else False, dtype=torch_dtype):\n",
" frames = model.generate(\n",
" prompt=prompt,\n",
" num_inference_steps=[20, 20, 20],\n",
" video_num_inference_steps=[10, 10, 10],\n",
" height=height,\n",
" width=width,\n",
" temp=temp,\n",
" guidance_scale=7.0, # The guidance for the first frame, set it to 7 for 384p variant\n",
" video_guidance_scale=5.0, # The guidance for the other video latent\n",
" output_type=\"pil\",\n",
" save_memory=True, # If you have enough GPU memory, set it to `False` to improve vae decoding speed\n",
" )\n",
"\n",
"export_to_video(frames, \"./text_to_video_sample.mp4\", fps=24)\n",
"show_video(None, \"./text_to_video_sample.mp4\", \"70%\")"
]
},
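{
 "attachments": {},
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "The imports above also bring in `export_to_gif` from `diffusers.utils`; it can turn the same `frames` list into a shareable preview. Downscaling each frame by 2x first is an assumption made here purely to keep the GIF small at 768p, not something the API requires."
 ]
},
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "# Optional GIF preview of the frames generated above.\n",
  "# The 2x downscale is a size/quality trade-off, not required by the API.\n",
  "preview = [f.resize((f.width // 2, f.height // 2)) for f in frames]\n",
  "export_to_gif(preview, \"./text_to_video_sample.gif\")\n",
  "ipython_image(\"./text_to_video_sample.gif\", width=480)"
 ]
},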
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Image-to-Video"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"image_path = 'assets/the_great_wall.jpg'\n",
"image = Image.open(image_path).convert(\"RGB\")\n",
"\n",
"# used for 384p model variant\n",
"# width = 640\n",
"# height = 384\n",
"\n",
"# used for 768p model variant\n",
"width = 1280\n",
"height = 768\n",
"\n",
"temp = 16\n",
"image = image.resize((width, height))\n",
"image = resize_crop_image(image, width, height)\n",
"\n",
"display(image)\n",
"\n",
"prompt = \"FPV flying over the Great Wall\"\n",
"\n",
"with torch.no_grad(), torch.cuda.amp.autocast(enabled=True if model_dtype != 'fp32' else False, dtype=torch_dtype):\n",
" frames = model.generate_i2v(\n",
" prompt=prompt,\n",
" input_image=image,\n",
" num_inference_steps=[10, 10, 10],\n",
" temp=temp,\n",
" guidance_scale=7.0,\n",
" video_guidance_scale=4.0,\n",
" output_type=\"pil\",\n",
" save_memory=True, # If you have enough GPU memory, set it to `False` to improve vae decoding speed\n",
" )\n",
"\n",
"export_to_video(frames, \"./image_to_video_sample.mp4\", fps=24)\n",
"show_video(None, \"./image_to_video_sample.mp4\", \"70%\")"
]
}
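,
{
 "attachments": {},
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "A quick visual check on how faithfully the first generated frame matches the conditioning image. A pure-PIL sketch; that `frames[0]` corresponds to the decoded first frame of the clip is an assumption about `generate_i2v`'s output layout."
 ]
},
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "# Side-by-side: conditioning image (left) vs. first generated frame (right).\n",
  "# Assumption: frames[0] is the decoded first frame of the generated clip.\n",
  "comparison = Image.new(\"RGB\", (image.width * 2, image.height))\n",
  "comparison.paste(image, (0, 0))\n",
  "comparison.paste(frames[0].resize(image.size), (image.width, 0))\n",
  "display(comparison.resize((comparison.width // 2, comparison.height // 2)))"
 ]
}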
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}