|
import gradio as gr |
|
import torch |
|
from huggingface_hub import snapshot_download |
|
|
|
from xora.models.autoencoders.causal_video_autoencoder import CausalVideoAutoencoder |
|
from xora.models.transformers.transformer3d import Transformer3DModel |
|
from xora.models.transformers.symmetric_patchifier import SymmetricPatchifier |
|
from xora.schedulers.rf import RectifiedFlowScheduler |
|
from xora.pipelines.pipeline_xora_video import XoraVideoPipeline |
|
from transformers import T5EncoderModel, T5Tokenizer |
|
from xora.utils.conditioning_method import ConditioningMethod |
|
from pathlib import Path |
|
import safetensors.torch |
|
import json |
|
import numpy as np |
|
import cv2 |
|
from PIL import Image |
|
import tempfile |
|
import os |
|
|
|
|
|
# Optional Hugging Face access token, taken from the environment (None if unset).
hf_token = os.environ.get("HF_TOKEN")

# Local directory that holds the model snapshot; it is only downloaded when
# the directory does not exist yet.
model_path = "asset"
_snapshot_missing = not os.path.exists(model_path)
if _snapshot_missing:
    snapshot_download(
        "Lightricks/LTX-Video",
        local_dir=model_path,
        repo_type="model",
        token=hf_token,
    )

# Sub-directories of the snapshot, one per pipeline component.
_snapshot_root = Path(model_path)
vae_dir = _snapshot_root / "vae"
unet_dir = _snapshot_root / "unet"
scheduler_dir = _snapshot_root / "scheduler"

# Prefer the GPU when one is visible to torch, otherwise fall back to the CPU.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)
|
|
|
|
|
def load_vae(vae_dir):
    """Build the video autoencoder from a checkpoint directory.

    Args:
        vae_dir: ``pathlib.Path`` of the directory containing ``config.json``
            and ``vae_diffusion_pytorch_model.safetensors``.

    Returns:
        The ``CausalVideoAutoencoder`` with its weights loaded, moved to the
        module-level ``device`` and cast to ``torch.bfloat16``.
    """
    vae_ckpt_path = vae_dir / "vae_diffusion_pytorch_model.safetensors"
    vae_config_path = vae_dir / "config.json"

    with open(vae_config_path, "r") as f:
        vae_config = json.load(f)

    vae = CausalVideoAutoencoder.from_config(vae_config)
    vae_state_dict = safetensors.torch.load_file(vae_ckpt_path)
    vae.load_state_dict(vae_state_dict)

    # Follow the module-level ``device`` (as load_unet does) instead of a
    # hard-coded ``.cuda()``, which raises on hosts without a GPU.
    return vae.to(device).to(torch.bfloat16)
|
|
|
|
|
def load_unet(unet_dir):
    """Build the 3D transformer from a checkpoint directory.

    Args:
        unet_dir: ``pathlib.Path`` of the directory containing ``config.json``
            and ``unet_diffusion_pytorch_model.safetensors``.

    Returns:
        The ``Transformer3DModel`` with weights loaded (strictly, so any
        missing or unexpected key is an error) and moved to ``device``.
    """
    config = Transformer3DModel.load_config(unet_dir / "config.json")
    model = Transformer3DModel.from_config(config)

    weights = safetensors.torch.load_file(
        unet_dir / "unet_diffusion_pytorch_model.safetensors"
    )
    model.load_state_dict(weights, strict=True)

    return model.to(device)
|
|
|
|
|
def load_scheduler(scheduler_dir):
    """Create the rectified-flow scheduler described by ``scheduler_config.json``.

    Args:
        scheduler_dir: ``pathlib.Path`` of the directory holding the config file.

    Returns:
        A ``RectifiedFlowScheduler`` built from that configuration.
    """
    config = RectifiedFlowScheduler.load_config(
        scheduler_dir / "scheduler_config.json"
    )
    return RectifiedFlowScheduler.from_config(config)
|
|
|
|
|
|
|
def center_crop_and_resize(frame, target_height, target_width):
    """Center-crop ``frame`` to the target aspect ratio, then resize it.

    Args:
        frame: Array indexed as (rows, columns, channels).
        target_height: Output height in pixels.
        target_width: Output width in pixels.

    Returns:
        The cropped frame resized by ``cv2.resize`` to
        ``(target_width, target_height)``.
    """
    src_h, src_w, _ = frame.shape
    wanted_ratio = target_width / target_height

    # Start from "keep everything" and narrow only the axis that is too long.
    rows = slice(None)
    cols = slice(None)
    if src_w / src_h > wanted_ratio:
        # Source is too wide: trim columns symmetrically.
        crop_w = int(src_h * wanted_ratio)
        left = (src_w - crop_w) // 2
        cols = slice(left, left + crop_w)
    else:
        # Source is too tall (or already matches): trim rows symmetrically.
        crop_h = int(src_w / wanted_ratio)
        top = (src_h - crop_h) // 2
        rows = slice(top, top + crop_h)

    return cv2.resize(frame[rows, cols], (target_width, target_height))
|
|
|
|
|
def load_image_to_tensor_with_resize(image_path, target_height=512, target_width=768):
    """Load an image file as a normalized conditioning tensor.

    The image is opened as RGB, center-cropped and resized to the target
    size, reordered to channels-first, and rescaled with ``x / 127.5 - 1``.

    Args:
        image_path: Path of the image file to open.
        target_height: Height the image is resized to.
        target_width: Width the image is resized to.

    Returns:
        A float tensor with singleton dimensions inserted at positions 0
        and 2, i.e. shaped (1, channels, 1, height, width).
    """
    rgb_pixels = np.array(Image.open(image_path).convert("RGB"))
    fitted = center_crop_and_resize(rgb_pixels, target_height, target_width)

    # (height, width, channels) -> (channels, height, width)
    channels_first = torch.tensor(fitted).permute(2, 0, 1).float()
    normalized = (channels_first / 127.5) - 1.0

    # Insert the two singleton axes in one indexing step.
    return normalized[None, :, None]
|
|
|
|
|
|
|
# (width, height, num_frames) for every built-in resolution preset, in the
# order they are offered in the dropdown.
_PRESET_SIZES = (
    (1216, 704, 41),
    (1088, 704, 49),
    (1056, 640, 57),
    (992, 608, 65),
    (896, 608, 73),
    (896, 544, 81),
    (832, 544, 89),
    (800, 512, 97),
    (768, 512, 97),
    (800, 480, 105),
    (736, 480, 113),
    (704, 480, 121),
    (704, 448, 129),
    (672, 448, 137),
    (640, 416, 153),
    (672, 384, 161),
    (640, 384, 169),
    (608, 384, 177),
    (576, 384, 185),
    (608, 352, 193),
    (576, 352, 201),
    (544, 352, 209),
    (512, 352, 225),
    (512, 352, 233),
    (544, 320, 241),
    (512, 320, 249),
    (512, 320, 257),
)

# Dropdown entries: one dict per preset, labelled "<width>x<height>, <n> frames",
# followed by a "Custom" entry whose dimensions are left unset.
preset_options = [
    {
        "label": f"{preset_w}x{preset_h}, {preset_n} frames",
        "width": preset_w,
        "height": preset_h,
        "num_frames": preset_n,
    }
    for preset_w, preset_h, preset_n in _PRESET_SIZES
]
preset_options.append(
    {"label": "Custom", "height": None, "width": None, "num_frames": None}
)
|
|
|
|
|
|
|
def preset_changed(preset):
    """Translate a preset label into slider values and visibility updates.

    Args:
        preset: Label of the selected entry in ``preset_options``.

    Returns:
        A 6-tuple ``(height, width, num_frames, update, update, update)``.
        For a named preset the first three items are its dimensions and the
        updates hide their targets; for "Custom" the first three items are
        ``None`` and the updates show their targets.

    Raises:
        StopIteration: If ``preset`` is neither "Custom" nor a known label.
    """
    is_custom = preset == "Custom"

    if is_custom:
        dimensions = (None, None, None)
    else:
        chosen = next(opt for opt in preset_options if opt["label"] == preset)
        dimensions = (chosen["height"], chosen["width"], chosen["num_frames"])

    # Manual controls are only shown while the custom preset is active.
    visibility = tuple(gr.update(visible=is_custom) for _ in range(3))
    return dimensions + visibility
|
|
|
|
|
|
|
# Instantiate every pipeline component once, at import time.
vae = load_vae(vae_dir)
unet = load_unet(unet_dir)
scheduler = load_scheduler(scheduler_dir)
patchifier = SymmetricPatchifier(patch_size=1)

# The T5 text encoder and its tokenizer are pulled from the same repository,
# from the "text_encoder" and "tokenizer" subfolders respectively.
_text_model_repo = "PixArt-alpha/PixArt-XL-2-1024-MS"
text_encoder = T5EncoderModel.from_pretrained(
    _text_model_repo, subfolder="text_encoder"
)
text_encoder = text_encoder.to(device)
tokenizer = T5Tokenizer.from_pretrained(_text_model_repo, subfolder="tokenizer")

# Assemble the components into the video pipeline and place it on ``device``.
_pipeline_parts = {
    "transformer": unet,
    "patchifier": patchifier,
    "text_encoder": text_encoder,
    "tokenizer": tokenizer,
    "scheduler": scheduler,
    "vae": vae,
}
pipeline = XoraVideoPipeline(**_pipeline_parts).to(device)
|
|
|
|
|
def generate_video_from_text(
    prompt="",
    negative_prompt="",
    seed=171198,
    num_inference_steps=40,
    guidance_scale=3,
    height=512,
    width=768,
    num_frames=121,
    frame_rate=25,
    progress=gr.Progress(),
):
    """Generate a video from a text prompt and return the path of the MP4 file.

    Args:
        prompt: Description of the video; must be at least 50 characters
            after stripping surrounding whitespace.
        negative_prompt: Description of what should not appear.
        seed: Seed for the CPU random generator handed to the pipeline.
        num_inference_steps: Number of denoising steps.
        guidance_scale: Guidance scale forwarded to the pipeline.
        height: Requested frame height.
        width: Requested frame width.
        num_frames: Requested number of frames.
        frame_rate: Frame rate passed to the pipeline and used for the file.
        progress: Gradio progress tracker updated after every step.

    Returns:
        Path of a temporary ``.mp4`` file containing the generated video.

    Raises:
        gr.Error: If the prompt is shorter than 50 characters.
    """
    if len(prompt.strip()) < 50:
        raise gr.Error(
            "Prompt must be at least 50 characters long. Please provide more details for the best results.",
            duration=5,
        )

    sample = {
        "prompt": prompt,
        "prompt_attention_mask": None,
        "negative_prompt": negative_prompt,
        "negative_prompt_attention_mask": None,
        "media_items": None,
    }

    # Sliders may deliver the seed as a float; manual_seed needs an int.
    generator = torch.Generator(device="cpu").manual_seed(int(seed))

    def gradio_progress_callback(self, step, timestep, kwargs):
        # Report the fraction of completed denoising steps to the UI.
        progress((step + 1) / num_inference_steps)

    # NOTE(review): FIRST_FRAME conditioning is requested even though no media
    # item is supplied here -- confirm the pipeline treats this as unconditional.
    images = pipeline(
        num_inference_steps=num_inference_steps,
        num_images_per_prompt=1,
        guidance_scale=guidance_scale,
        generator=generator,
        output_type="pt",
        height=height,
        width=width,
        num_frames=num_frames,
        frame_rate=frame_rate,
        **sample,
        is_video=True,
        vae_per_channel_normalize=True,
        conditioning_method=ConditioningMethod.FIRST_FRAME,
        mixed_precision=True,
        callback_on_step_end=gradio_progress_callback,
    ).images

    # Drop the batch axis and move the leading axis last so the array is laid
    # out as (frames, height, width, channels) for the video writer.
    video_np = images.squeeze(0).permute(1, 2, 3, 0).cpu().float().numpy()
    # Clip before the uint8 cast so slightly out-of-range values saturate
    # instead of wrapping around (assumes the pipeline emits values in [0, 1]).
    video_np = (np.clip(video_np, 0.0, 1.0) * 255).astype(np.uint8)
    frame_height, frame_width = video_np.shape[1:3]

    # Reserve the output file atomically; tempfile.mktemp is deprecated
    # because another process can claim the name before it is used.
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp_file:
        output_path = tmp_file.name

    out = cv2.VideoWriter(
        output_path,
        cv2.VideoWriter_fourcc(*"mp4v"),
        frame_rate,
        (frame_width, frame_height),
    )
    try:
        # Reverse the channel axis (RGB -> BGR, OpenCV's channel order).
        for frame in video_np[..., ::-1]:
            out.write(np.ascontiguousarray(frame))
    finally:
        # Always finalize the container, even if a frame fails to encode.
        out.release()

    return output_path
|
|
|
|
|
def generate_video_from_image(
    image_path,
    prompt="",
    negative_prompt="",
    seed=171198,
    num_inference_steps=40,
    guidance_scale=3,
    height=512,
    width=768,
    num_frames=121,
    frame_rate=25,
    progress=gr.Progress(),
):
    """Animate an input image according to a prompt and return the MP4 path.

    Args:
        image_path: Path of the conditioning image.
        prompt: Description of the animation; must be at least 50 characters
            after stripping surrounding whitespace.
        negative_prompt: Description of what should not appear.
        seed: Seed for the CPU random generator handed to the pipeline.
        num_inference_steps: Number of denoising steps.
        guidance_scale: Guidance scale forwarded to the pipeline.
        height: Requested frame height; the image is resized to it.
        width: Requested frame width; the image is resized to it.
        num_frames: Requested number of frames.
        frame_rate: Frame rate passed to the pipeline and used for the file.
        progress: Gradio progress tracker updated after every step.

    Returns:
        Path of a temporary ``.mp4`` file containing the generated video.

    Raises:
        gr.Error: If the prompt is shorter than 50 characters or no image
            was provided.
    """
    if len(prompt.strip()) < 50:
        raise gr.Error(
            "Prompt must be at least 50 characters long. Please provide more details for the best results.",
            duration=5,
        )

    if not image_path:
        raise gr.Error("Please provide an input image.", duration=5)

    # Conditioning frame, resized to the requested output resolution.
    media_items = load_image_to_tensor_with_resize(image_path, height, width).to(device)

    sample = {
        "prompt": prompt,
        "prompt_attention_mask": None,
        "negative_prompt": negative_prompt,
        "negative_prompt_attention_mask": None,
        "media_items": media_items,
    }

    # Sliders may deliver the seed as a float; manual_seed needs an int.
    generator = torch.Generator(device="cpu").manual_seed(int(seed))

    def gradio_progress_callback(self, step, timestep, kwargs):
        # Report the fraction of completed denoising steps to the UI.
        progress((step + 1) / num_inference_steps)

    images = pipeline(
        num_inference_steps=num_inference_steps,
        num_images_per_prompt=1,
        guidance_scale=guidance_scale,
        generator=generator,
        output_type="pt",
        height=height,
        width=width,
        num_frames=num_frames,
        frame_rate=frame_rate,
        **sample,
        is_video=True,
        vae_per_channel_normalize=True,
        conditioning_method=ConditioningMethod.FIRST_FRAME,
        mixed_precision=True,
        callback_on_step_end=gradio_progress_callback,
    ).images

    # Drop the batch axis and move the leading axis last so the array is laid
    # out as (frames, height, width, channels) for the video writer.
    video_np = images.squeeze(0).permute(1, 2, 3, 0).cpu().float().numpy()
    # Clip before the uint8 cast so slightly out-of-range values saturate
    # instead of wrapping around (assumes the pipeline emits values in [0, 1]).
    video_np = (np.clip(video_np, 0.0, 1.0) * 255).astype(np.uint8)
    frame_height, frame_width = video_np.shape[1:3]

    # Reserve the output file atomically; tempfile.mktemp is deprecated
    # because another process can claim the name before it is used.
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp_file:
        output_path = tmp_file.name

    out = cv2.VideoWriter(
        output_path,
        cv2.VideoWriter_fourcc(*"mp4v"),
        frame_rate,
        (frame_width, frame_height),
    )
    try:
        # Reverse the channel axis (RGB -> BGR, OpenCV's channel order).
        for frame in video_np[..., ::-1]:
            out.write(np.ascontiguousarray(frame))
    finally:
        # Always finalize the container, even if a frame fails to encode.
        out.release()

    return output_path
|
|
|
|
|
def create_advanced_options():
    """Create the collapsible "Advanced Options" accordion.

    Must be called inside an active ``gr.Blocks`` context. The dimension,
    frame-count and frame-rate sliders start hidden.

    Returns:
        A list of the seven sliders in this order: seed, inference steps,
        guidance scale, height, width, number of frames, frame rate.
    """
    with gr.Accordion("Step 4: Advanced Options (Optional)", open=False):
        seed = gr.Slider(
            label="4.1 Seed", minimum=0, maximum=1000000, step=1, value=171198
        )
        inference_steps = gr.Slider(
            label="4.2 Inference Steps", minimum=1, maximum=100, step=1, value=40
        )
        guidance_scale = gr.Slider(
            label="4.3 Guidance Scale", minimum=1.0, maximum=20.0, step=0.1, value=3.0
        )

        # Slider ranges must cover every entry of preset_options (up to 1216
        # wide and 257 frames); a step of 32 is needed because several preset
        # sizes (e.g. 608, 544, 480) are not multiples of 64.
        height_slider = gr.Slider(
            label="4.4 Height",
            minimum=256,
            maximum=1024,
            step=32,
            value=704,
            visible=False,
        )
        width_slider = gr.Slider(
            label="4.5 Width",
            minimum=256,
            maximum=1280,
            step=32,
            value=1216,
            visible=False,
        )
        num_frames_slider = gr.Slider(
            label="4.6 Number of Frames",
            minimum=1,
            maximum=257,
            step=1,
            value=41,
            visible=False,
        )
        frame_rate = gr.Slider(
            label="4.7 Frame Rate",
            minimum=1,
            maximum=60,
            step=1,
            value=25,
            visible=False,
        )

    return [
        seed,
        inference_steps,
        guidance_scale,
        height_slider,
        width_slider,
        num_frames_slider,
        frame_rate,
    ]
|
|
|
|
|
|
|
# Negative prompt shared by both tabs and by the bundled examples.
_DEFAULT_NEGATIVE_PROMPT = "low quality, worst quality, deformed, distorted, disfigured, motion smear, motion artifacts, fused fingers, bad anatomy, weird hand, ugly"


def _sync_sliders_with_preset(preset):
    """Return updates for the height, width, frame-count and frame-rate sliders.

    ``preset_changed`` yields six values, which cannot be wired to the slider
    list directly; this adapter folds its dimensions into exactly one update
    per slider so the number of return values matches the event outputs.

    Args:
        preset: Label currently selected in the preset dropdown.

    Returns:
        A 4-tuple of ``gr.update`` objects. For a named preset the first three
        carry the preset's height, width and frame count and all four hide
        their slider; for "Custom" all four only make their slider visible.
    """
    height, width, num_frames = preset_changed(preset)[:3]
    if preset == "Custom":
        return tuple(gr.update(visible=True) for _ in range(4))
    return (
        gr.update(value=height, visible=False),
        gr.update(value=width, visible=False),
        gr.update(value=num_frames, visible=False),
        gr.update(visible=False),
    )


with gr.Blocks(theme=gr.themes.Soft()) as iface:
    # Page title.
    with gr.Row(elem_id="title-row"):
        gr.Markdown(
            """
            <div style="text-align: center; margin-bottom: 1em">
            <h1 style="font-size: 2.5em; font-weight: 600; margin: 0.5em 0;">Video Generation with LTX Video</h1>
            </div>
            """
        )

    # Collapsible usage tips.
    with gr.Accordion(
        " 📖 Tips for Best Results", open=False, elem_id="instructions-accordion"
    ):
        gr.Markdown(
            """
            📝 Prompt Engineering

            When writing prompts, focus on detailed, chronological descriptions of actions and scenes. Include specific movements, appearances, camera angles, and environmental details - all in a single flowing paragraph. Start directly with the action, and keep descriptions literal and precise. Think like a cinematographer describing a shot list. Keep within 200 words.
            For best results, build your prompts using this structure:

            - Start with main action in a single sentence
            - Add specific details about movements and gestures
            - Describe character/object appearances precisely
            - Include background and environment details
            - Specify camera angles and movements
            - Describe lighting and colors
            - Note any changes or sudden events

            See examples for more inspiration.

            🎮 Parameter Guide

            - Resolution Preset: Higher resolutions for detailed scenes, lower for faster generation and simpler scenes
            - Seed: Save seed values to recreate specific styles or compositions you like
            - Guidance Scale: Higher values (5-7) for accurate prompt following, lower values (3-5) for more creative freedom
            - Inference Steps: More steps (40+) for quality, fewer steps (20-30) for speed
            """
        )

    with gr.Tabs():
        # ------------------------------------------------------------------
        # Text-to-video tab
        # ------------------------------------------------------------------
        with gr.TabItem("Text to Video"):
            with gr.Row():
                with gr.Column():
                    txt2vid_prompt = gr.Textbox(
                        label="Step 1: Enter Your Prompt",
                        placeholder="Describe the video you want to generate (minimum 50 characters)...",
                        value="A man riding a motorcycle down a winding road, surrounded by lush, green scenery and distant mountains. The sky is clear with a few wispy clouds, and the sunlight glistens on the motorcycle as it speeds along.",
                        lines=5,
                    )
                    txt2vid_negative_prompt = gr.Textbox(
                        label="Step 2: Enter Negative Prompt",
                        placeholder="Describe what you don't want in the video...",
                        value=_DEFAULT_NEGATIVE_PROMPT,
                        lines=2,
                    )

                    txt2vid_preset = gr.Dropdown(
                        choices=[p["label"] for p in preset_options],
                        value="1216x704, 41 frames",
                        label="Step 3: Choose Resolution Preset",
                    )

                    txt2vid_advanced = create_advanced_options()
                    txt2vid_generate = gr.Button(
                        "Step 5: Generate Video",
                        variant="primary",
                        size="lg",
                    )

                with gr.Column():
                    txt2vid_output = gr.Video(label="Generated Output")

            with gr.Row():
                gr.Examples(
                    examples=[
                        [
                            "A woman stirs a pot of boiling water on a white electric burner. Her hands, with purple nail polish, hold a wooden spoon and move it in a circular motion within a white pot filled with bubbling water. The pot sits on a white electric burner with black buttons and a digital display. The burner is positioned on a white countertop with a red and white checkered cloth partially visible in the bottom right corner. The camera angle is a direct overhead shot, remaining stationary throughout the scene. The lighting is bright and even, illuminating the scene with a neutral white light. The scene is real-life footage.",
                            _DEFAULT_NEGATIVE_PROMPT,
                            "assets/i2v_2.mp4",
                        ],
                        [
                            "A woman in a long, flowing dress stands in a field, her back to the camera, gazing towards the horizon; her hair is long and light, cascading down her back; she stands beneath the sprawling branches of a large oak tree; to her left, a classic American car is parked on the dry grass; in the distance, a wrecked car lies on its side; the sky above is a dramatic canvas of bright white clouds against a darker sky; the entire image is in black and white, emphasizing the contrast of light and shadow. The woman is walking slowly towards the car.",
                            _DEFAULT_NEGATIVE_PROMPT,
                            "assets/i2v_0.mp4",
                        ],
                        [
                            "A pair of hands shapes a piece of clay on a pottery wheel, gradually forming a cone shape. The hands, belonging to a person out of frame, are covered in clay and gently press a ball of clay onto the center of a spinning pottery wheel. The hands move in a circular motion, gradually forming a cone shape at the top of the clay. The camera is positioned directly above the pottery wheel, providing a bird’s-eye view of the clay being shaped. The lighting is bright and even, illuminating the clay and the hands working on it. The scene is captured in real-life footage.",
                            _DEFAULT_NEGATIVE_PROMPT,
                            "assets/t2v_0.mp4",
                        ],
                    ],
                    inputs=[txt2vid_prompt, txt2vid_negative_prompt, txt2vid_output],
                    label="Example Text-to-Video Generations",
                )

        # ------------------------------------------------------------------
        # Image-to-video tab
        # ------------------------------------------------------------------
        with gr.TabItem("Image to Video"):
            with gr.Row():
                with gr.Column():
                    img2vid_image = gr.Image(
                        type="filepath",
                        label="Step 1: Upload Input Image",
                        elem_id="image_upload",
                    )
                    img2vid_prompt = gr.Textbox(
                        label="Step 2: Enter Your Prompt",
                        placeholder="Describe how you want to animate the image (minimum 50 characters)...",
                        value="A man riding a motorcycle down a winding road, surrounded by lush, green scenery...",
                        lines=5,
                    )
                    img2vid_negative_prompt = gr.Textbox(
                        label="Step 3: Enter Negative Prompt",
                        placeholder="Describe what you don't want in the video...",
                        value=_DEFAULT_NEGATIVE_PROMPT,
                        lines=2,
                    )

                    img2vid_preset = gr.Dropdown(
                        choices=[p["label"] for p in preset_options],
                        value="1216x704, 41 frames",
                        label="Step 4: Choose Resolution Preset",
                    )

                    img2vid_advanced = create_advanced_options()
                    img2vid_generate = gr.Button(
                        "Step 6: Generate Video", variant="primary", size="lg"
                    )

                with gr.Column():
                    img2vid_output = gr.Video(label="Generated Output")

            with gr.Row():
                gr.Examples(
                    examples=[
                        [
                            "assets/astronaut.jpg",
                            "An astronaut hatching from an egg, on the surface of the moon, the darkness and depth of space realised in the background.",
                            _DEFAULT_NEGATIVE_PROMPT,
                            "assets/astronaut_left.mp4",
                        ],
                        # NOTE(review): the prompt column of this row holds the
                        # negative-prompt text, so the positive prompt for the
                        # dancer example appears to be missing -- confirm the
                        # intended prompt and restore it.
                        [
                            "assets/dancer.jpg",
                            _DEFAULT_NEGATIVE_PROMPT,
                            "poor quality, jerky motion, blurry",
                            "assets/dancer_up.mp4",
                        ],
                    ],
                    inputs=[
                        img2vid_image,
                        img2vid_prompt,
                        img2vid_negative_prompt,
                        img2vid_output,
                    ],
                    label="Example Image-to-Video Generations",
                )

    # ----------------------------------------------------------------------
    # Event wiring
    # ----------------------------------------------------------------------
    # create_advanced_options() returns [seed, steps, guidance, height, width,
    # num_frames, frame_rate]; the preset drives the last four, so the slice
    # starts at index 3 and the handler returns exactly four updates.
    txt2vid_preset.change(
        fn=_sync_sliders_with_preset,
        inputs=[txt2vid_preset],
        outputs=txt2vid_advanced[3:],
    )

    txt2vid_generate.click(
        fn=generate_video_from_text,
        inputs=[txt2vid_prompt, txt2vid_negative_prompt, *txt2vid_advanced],
        outputs=txt2vid_output,
        concurrency_limit=1,
    )

    img2vid_preset.change(
        fn=_sync_sliders_with_preset,
        inputs=[img2vid_preset],
        outputs=img2vid_advanced[3:],
    )

    img2vid_generate.click(
        fn=generate_video_from_image,
        inputs=[
            img2vid_image,
            img2vid_prompt,
            img2vid_negative_prompt,
            *img2vid_advanced,
        ],
        outputs=img2vid_output,
        concurrency_limit=1,
    )

iface.launch(share=True)
|
|