ai-tube-model-ltxv-1

Paused

App Files Files Community

ai-tube-model-ltxv-1 / app.py

benibraz

Remove num_images_per_prompt parameter from video generation functions

1697f47 23 days ago

raw

history blame

18.4 kB

	import gradio as gr
	import torch
	from huggingface_hub import snapshot_download

	from xora.models.autoencoders.causal_video_autoencoder import CausalVideoAutoencoder
	from xora.models.transformers.transformer3d import Transformer3DModel
	from xora.models.transformers.symmetric_patchifier import SymmetricPatchifier
	from xora.schedulers.rf import RectifiedFlowScheduler
	from xora.pipelines.pipeline_xora_video import XoraVideoPipeline
	from transformers import T5EncoderModel, T5Tokenizer
	from xora.utils.conditioning_method import ConditioningMethod
	from pathlib import Path
	import safetensors.torch
	import json
	import numpy as np
	import cv2
	from PIL import Image
	import tempfile
	import os

	# Optional Hugging Face access token, taken from the environment (None when unset).
	hf_token = os.environ.get("HF_TOKEN")

	# Local directory holding the model snapshot; it is downloaded only when missing.
	model_path = "asset"
	if not Path(model_path).exists():
	    snapshot_download(
	        "Lightricks/LTX-Video",
	        local_dir=model_path,
	        repo_type="model",
	        token=hf_token,
	    )

	# Per-component sub-directories of the snapshot, used by the loaders below.
	vae_dir, unet_dir, scheduler_dir = (
	    Path(model_path) / component for component in ("vae", "unet", "scheduler")
	)

	# Prefer the GPU when one is available, otherwise fall back to the CPU.
	device = (
	    torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
	)


	def load_vae(vae_dir):
	    """Build the video VAE from its JSON config and safetensors checkpoint.

	    Args:
	        vae_dir: Directory (joined with ``/``, e.g. a ``pathlib.Path``) that
	            contains ``config.json`` and
	            ``vae_diffusion_pytorch_model.safetensors``.

	    Returns:
	        The ``CausalVideoAutoencoder`` with its weights loaded, moved to the
	        module-level ``device`` and cast to ``torch.bfloat16``.
	    """
	    config_path = vae_dir / "config.json"
	    checkpoint_path = vae_dir / "vae_diffusion_pytorch_model.safetensors"

	    with open(config_path, "r") as f:
	        vae_config = json.load(f)

	    vae = CausalVideoAutoencoder.from_config(vae_config)
	    vae.load_state_dict(safetensors.torch.load_file(checkpoint_path))

	    # Follow the module-level `device` (as load_unet does) rather than calling
	    # .cuda() unconditionally, which fails on hosts without a GPU.
	    return vae.to(device).to(torch.bfloat16)


	def load_unet(unet_dir):
	    """Build the 3D transformer from ``unet_dir`` and place it on ``device``.

	    Reads ``config.json`` for the model configuration and
	    ``unet_diffusion_pytorch_model.safetensors`` for the weights, which are
	    loaded with ``strict=True``.
	    """
	    model_config = Transformer3DModel.load_config(unet_dir / "config.json")
	    model = Transformer3DModel.from_config(model_config)

	    weights = safetensors.torch.load_file(
	        unet_dir / "unet_diffusion_pytorch_model.safetensors"
	    )
	    model.load_state_dict(weights, strict=True)

	    model = model.to(device)
	    return model


	def load_scheduler(scheduler_dir):
	    """Create the rectified-flow scheduler described by ``scheduler_config.json``."""
	    config = RectifiedFlowScheduler.load_config(
	        scheduler_dir / "scheduler_config.json"
	    )
	    scheduler_instance = RectifiedFlowScheduler.from_config(config)
	    return scheduler_instance


	# Helper function for image processing
	def center_crop_and_resize(frame, target_height, target_width):
	    """Center-crop ``frame`` to the target aspect ratio, then resize it.

	    ``frame`` must expose a three-element ``shape`` (rows, columns, channels).
	    When the frame is wider than the target ratio, columns are trimmed equally
	    from both sides; otherwise rows are trimmed from top and bottom. The crop
	    is then resized with ``cv2.resize`` to ``(target_width, target_height)``.
	    """
	    rows, cols, _ = frame.shape
	    wanted_ratio = target_width / target_height

	    # Start from the full frame and narrow exactly one axis.
	    row_window = slice(None)
	    col_window = slice(None)
	    if cols / rows > wanted_ratio:
	        crop_cols = int(rows * wanted_ratio)
	        left = (cols - crop_cols) // 2
	        col_window = slice(left, left + crop_cols)
	    else:
	        crop_rows = int(cols / wanted_ratio)
	        top = (rows - crop_rows) // 2
	        row_window = slice(top, top + crop_rows)

	    return cv2.resize(frame[row_window, col_window], (target_width, target_height))


	def load_image_to_tensor_with_resize(image_path, target_height=512, target_width=768):
	    """Load an image file as a normalized conditioning tensor.

	    The image is converted to RGB, center-cropped and resized to
	    ``target_width`` x ``target_height``, and its pixel values are mapped
	    from ``[0, 255]`` to ``[-1, 1]``.

	    Args:
	        image_path: Path of the image file to read.
	        target_height: Output height in pixels.
	        target_width: Output width in pixels.

	    Returns:
	        A float tensor of shape ``(1, 3, 1, target_height, target_width)``:
	        the channel-first image with singleton axes inserted at positions
	        0 and 2.
	    """
	    # The context manager closes the underlying file handle as soon as the
	    # pixel data has been copied into the NumPy array.
	    with Image.open(image_path) as image:
	        image_np = np.array(image.convert("RGB"))

	    frame_resized = center_crop_and_resize(image_np, target_height, target_width)
	    frame_tensor = torch.tensor(frame_resized).permute(2, 0, 1).float()
	    frame_tensor = (frame_tensor / 127.5) - 1.0
	    return frame_tensor.unsqueeze(0).unsqueeze(2)


	# Preset options for resolution and frame configuration.
	# Each (width, height, num_frames) triple becomes one dropdown entry whose
	# label is "<width>x<height>, <num_frames> frames"; the trailing "Custom"
	# entry carries no dimensions.
	preset_options = [
	    {
	        "label": f"{w}x{h}, {n} frames",
	        "width": w,
	        "height": h,
	        "num_frames": n,
	    }
	    for w, h, n in (
	        (1216, 704, 41),
	        (1088, 704, 49),
	        (1056, 640, 57),
	        (992, 608, 65),
	        (896, 608, 73),
	        (896, 544, 81),
	        (832, 544, 89),
	        (800, 512, 97),
	        (768, 512, 97),
	        (800, 480, 105),
	        (736, 480, 113),
	        (704, 480, 121),
	        (704, 448, 129),
	        (672, 448, 137),
	        (640, 416, 153),
	        (672, 384, 161),
	        (640, 384, 169),
	        (608, 384, 177),
	        (576, 384, 185),
	        (608, 352, 193),
	        (576, 352, 201),
	        (544, 352, 209),
	        (512, 352, 225),
	        (512, 352, 233),
	        (544, 320, 241),
	        (512, 320, 249),
	        (512, 320, 257),
	    )
	] + [{"label": "Custom", "height": None, "width": None, "num_frames": None}]


	# Function to toggle visibility of sliders based on preset selection
	def preset_changed(preset):
	    """Map a preset label to dimension values plus slider visibility updates.

	    Returns a 6-tuple ``(height, width, num_frames, update, update, update)``.
	    For a named preset the first three entries are taken from the matching
	    ``preset_options`` item and the three updates hide their components; for
	    ``"Custom"`` the first three entries are ``None`` and the updates show
	    them.
	    """
	    is_custom = preset == "Custom"
	    visibility = tuple(gr.update(visible=is_custom) for _ in range(3))

	    if is_custom:
	        return (None, None, None) + visibility

	    chosen = next(item for item in preset_options if item["label"] == preset)
	    dimensions = (chosen["height"], chosen["width"], chosen["num_frames"])
	    return dimensions + visibility


	# Load models
	vae = load_vae(vae_dir)
	unet = load_unet(unet_dir)
	scheduler = load_scheduler(scheduler_dir)
	patchifier = SymmetricPatchifier(patch_size=1)

	# The T5 text encoder and its tokenizer live in sub-folders of one repository.
	_text_model_repo = "PixArt-alpha/PixArt-XL-2-1024-MS"
	text_encoder = T5EncoderModel.from_pretrained(
	    _text_model_repo, subfolder="text_encoder"
	)
	text_encoder = text_encoder.to(device)
	tokenizer = T5Tokenizer.from_pretrained(_text_model_repo, subfolder="tokenizer")

	# Assemble every component into the video pipeline and move it to `device`.
	_pipeline_components = dict(
	    transformer=unet,
	    patchifier=patchifier,
	    text_encoder=text_encoder,
	    tokenizer=tokenizer,
	    scheduler=scheduler,
	    vae=vae,
	)
	pipeline = XoraVideoPipeline(**_pipeline_components).to(device)


	def generate_video_from_text(
	    prompt="",
	    negative_prompt="",
	    seed=171198,
	    num_inference_steps=40,
	    guidance_scale=3,
	    height=512,
	    width=768,
	    num_frames=121,
	    frame_rate=25,
	    progress=gr.Progress(),
	):
	    """Generate a video from a text prompt and return the path of the MP4 file.

	    Args:
	        prompt: Description of the video; must be at least 50 characters
	            after stripping surrounding whitespace.
	        negative_prompt: Description of what should not appear.
	        seed: Seed for the CPU random generator handed to the pipeline.
	        num_inference_steps: Number of pipeline steps; also the denominator
	            of the reported progress fraction.
	        guidance_scale: Guidance scale forwarded to the pipeline.
	        height: Requested frame height, forwarded to the pipeline.
	        width: Requested frame width, forwarded to the pipeline.
	        num_frames: Number of frames, forwarded to the pipeline.
	        frame_rate: Frame rate forwarded to the pipeline and used as the FPS
	            of the written video.
	        progress: Gradio progress tracker updated after every step.

	    Returns:
	        Path of a temporary ``.mp4`` file written with OpenCV (``mp4v``).

	    Raises:
	        gr.Error: If the prompt is shorter than 50 characters.
	    """
	    if len(prompt.strip()) < 50:
	        raise gr.Error(
	            "Prompt must be at least 50 characters long. Please provide more details for the best results.",
	            duration=5,
	        )

	    sample = {
	        "prompt": prompt,
	        "prompt_attention_mask": None,
	        "negative_prompt": negative_prompt,
	        "negative_prompt_attention_mask": None,
	        "media_items": None,
	    }

	    # manual_seed() only accepts integers; cast in case the UI slider hands
	    # the value over as a float.
	    generator = torch.Generator(device="cpu").manual_seed(int(seed))

	    def gradio_progress_callback(self, step, timestep, kwargs):
	        progress((step + 1) / num_inference_steps)

	    # NOTE(review): FIRST_FRAME conditioning is requested although no media
	    # item is supplied here — confirm the pipeline treats this as unconditional.
	    images = pipeline(
	        num_inference_steps=num_inference_steps,
	        num_images_per_prompt=1,
	        guidance_scale=guidance_scale,
	        generator=generator,
	        output_type="pt",
	        height=height,
	        width=width,
	        num_frames=num_frames,
	        frame_rate=frame_rate,
	        **sample,
	        is_video=True,
	        vae_per_channel_normalize=True,
	        conditioning_method=ConditioningMethod.FIRST_FRAME,
	        mixed_precision=True,
	        callback_on_step_end=gradio_progress_callback,
	    ).images

	    # Drop the leading axis and move axis 0 of the remainder to the end so
	    # frames can be iterated along the first axis.
	    # Assumes `images` is (1, channels, frames, height, width) in [0, 1] — TODO confirm.
	    video_np = images.squeeze(0).permute(1, 2, 3, 0).cpu().float().numpy()
	    video_np = (video_np * 255).astype(np.uint8)
	    frame_height, frame_width = video_np.shape[1:3]

	    # tempfile.mktemp() is deprecated and race-prone; NamedTemporaryFile
	    # creates the file atomically, and delete=False keeps it for the caller.
	    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp_file:
	        output_path = tmp_file.name

	    out = cv2.VideoWriter(
	        output_path,
	        cv2.VideoWriter_fourcc(*"mp4v"),
	        frame_rate,
	        (frame_width, frame_height),
	    )
	    try:
	        # Reverse the channel axis before handing each frame to OpenCV.
	        for frame in video_np[..., ::-1]:
	            out.write(frame)
	    finally:
	        # Always finalize the container, even if a write fails.
	        out.release()

	    return output_path


	def generate_video_from_image(
	    image_path,
	    prompt="",
	    negative_prompt="",
	    seed=171198,
	    num_inference_steps=40,
	    guidance_scale=3,
	    height=512,
	    width=768,
	    num_frames=121,
	    frame_rate=25,
	    progress=gr.Progress(),
	):
	    """Generate a video conditioned on an input image and a text prompt.

	    Args:
	        image_path: Path of the conditioning image; must be non-empty.
	        prompt: Description of the video; must be at least 50 characters
	            after stripping surrounding whitespace.
	        negative_prompt: Description of what should not appear.
	        seed: Seed for the CPU random generator handed to the pipeline.
	        num_inference_steps: Number of pipeline steps; also the denominator
	            of the reported progress fraction.
	        guidance_scale: Guidance scale forwarded to the pipeline.
	        height: Requested frame height; the image is cropped/resized to it.
	        width: Requested frame width; the image is cropped/resized to it.
	        num_frames: Number of frames, forwarded to the pipeline.
	        frame_rate: Frame rate forwarded to the pipeline and used as the FPS
	            of the written video.
	        progress: Gradio progress tracker updated after every step.

	    Returns:
	        Path of a temporary ``.mp4`` file written with OpenCV (``mp4v``).

	    Raises:
	        gr.Error: If the prompt is shorter than 50 characters or no image
	            was provided.
	    """
	    if len(prompt.strip()) < 50:
	        raise gr.Error(
	            "Prompt must be at least 50 characters long. Please provide more details for the best results.",
	            duration=5,
	        )

	    if not image_path:
	        raise gr.Error("Please provide an input image.", duration=5)

	    media_items = load_image_to_tensor_with_resize(image_path, height, width).to(device)

	    sample = {
	        "prompt": prompt,
	        "prompt_attention_mask": None,
	        "negative_prompt": negative_prompt,
	        "negative_prompt_attention_mask": None,
	        "media_items": media_items,
	    }

	    # manual_seed() only accepts integers; cast in case the UI slider hands
	    # the value over as a float.
	    generator = torch.Generator(device="cpu").manual_seed(int(seed))

	    def gradio_progress_callback(self, step, timestep, kwargs):
	        progress((step + 1) / num_inference_steps)

	    images = pipeline(
	        num_inference_steps=num_inference_steps,
	        num_images_per_prompt=1,
	        guidance_scale=guidance_scale,
	        generator=generator,
	        output_type="pt",
	        height=height,
	        width=width,
	        num_frames=num_frames,
	        frame_rate=frame_rate,
	        **sample,
	        is_video=True,
	        vae_per_channel_normalize=True,
	        conditioning_method=ConditioningMethod.FIRST_FRAME,
	        mixed_precision=True,
	        callback_on_step_end=gradio_progress_callback,
	    ).images

	    # Drop the leading axis and move axis 0 of the remainder to the end so
	    # frames can be iterated along the first axis.
	    # Assumes `images` is (1, channels, frames, height, width) in [0, 1] — TODO confirm.
	    video_np = images.squeeze(0).permute(1, 2, 3, 0).cpu().float().numpy()
	    video_np = (video_np * 255).astype(np.uint8)
	    frame_height, frame_width = video_np.shape[1:3]

	    # tempfile.mktemp() is deprecated and race-prone; NamedTemporaryFile
	    # creates the file atomically, and delete=False keeps it for the caller.
	    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp_file:
	        output_path = tmp_file.name

	    out = cv2.VideoWriter(
	        output_path,
	        cv2.VideoWriter_fourcc(*"mp4v"),
	        frame_rate,
	        (frame_width, frame_height),
	    )
	    try:
	        # Reverse the channel axis before handing each frame to OpenCV.
	        for frame in video_np[..., ::-1]:
	            out.write(frame)
	    finally:
	        # Always finalize the container, even if a write fails.
	        out.release()

	    return output_path


	def create_advanced_options():
	    """Create the "Advanced Options" accordion and return its sliders.

	    Must be called inside an active Gradio layout context. The height, width,
	    frame-count and frame-rate sliders are created hidden.

	    Returns:
	        A list ``[seed, inference_steps, guidance_scale, height_slider,
	        width_slider, num_frames_slider, frame_rate]`` in the positional
	        order expected by the generation functions after the prompts.
	    """
	    with gr.Accordion("Step 4: Advanced Options (Optional)", open=False):
	        seed = gr.Slider(
	            label="4.1 Seed", minimum=0, maximum=1000000, step=1, value=171198
	        )
	        inference_steps = gr.Slider(
	            label="4.2 Inference Steps", minimum=1, maximum=100, step=1, value=40
	        )
	        guidance_scale = gr.Slider(
	            label="4.3 Guidance Scale", minimum=1.0, maximum=20.0, step=0.1, value=3.0
	        )

	        # The ranges below must be able to hold every value in
	        # `preset_options`: all preset sizes are multiples of 32 (e.g. 608,
	        # 992), widths reach 1216 and frame counts reach 257.
	        height_slider = gr.Slider(
	            label="4.4 Height",
	            minimum=256,
	            maximum=1024,
	            step=32,
	            value=704,
	            visible=False,
	        )
	        width_slider = gr.Slider(
	            label="4.5 Width",
	            minimum=256,
	            maximum=1280,  # the default of 1216 previously exceeded the maximum
	            step=32,
	            value=1216,
	            visible=False,
	        )
	        num_frames_slider = gr.Slider(
	            label="4.6 Number of Frames",  # was a duplicate "4.5"
	            minimum=1,
	            maximum=257,
	            step=1,
	            value=41,
	            visible=False,
	        )
	        frame_rate = gr.Slider(
	            label="4.7 Frame Rate",
	            minimum=1,
	            maximum=60,
	            step=1,
	            value=25,
	            visible=False,
	        )

	    return [
	        seed,
	        inference_steps,
	        guidance_scale,
	        height_slider,
	        width_slider,
	        num_frames_slider,
	        frame_rate,
	    ]


	def _preset_to_slider_updates(preset):
	    """Turn a preset choice into updates for the four dimension sliders.

	    Returns one update each for the height, width, frame-count and frame-rate
	    sliders, in that order. A named preset writes its dimensions into the
	    (hidden) sliders so the generation functions receive them; "Custom"
	    reveals the sliders and leaves their current values untouched.
	    """
	    if preset == "Custom":
	        return tuple(gr.update(visible=True) for _ in range(4))

	    # Only the dimension values are needed from preset_changed's result.
	    height, width, num_frames = preset_changed(preset)[:3]
	    return (
	        gr.update(value=height, visible=False),
	        gr.update(value=width, visible=False),
	        gr.update(value=num_frames, visible=False),
	        gr.update(visible=False),
	    )


	# Define the Gradio interface with tabs
	with gr.Blocks(theme=gr.themes.Soft()) as iface:
	    with gr.Row(elem_id="title-row"):
	        gr.Markdown(
	            """
	            <div style="text-align: center; margin-bottom: 1em">
	            <h1 style="font-size: 2.5em; font-weight: 600; margin: 0.5em 0;">Video Generation with LTX Video</h1>
	            </div>
	            """
	        )
	    with gr.Accordion(
	        " 📖 Tips for Best Results", open=False, elem_id="instructions-accordion"
	    ):
	        gr.Markdown(
	            """
	            📝 Prompt Engineering

	            When writing prompts, focus on detailed, chronological descriptions of actions and scenes. Include specific movements, appearances, camera angles, and environmental details - all in a single flowing paragraph. Start directly with the action, and keep descriptions literal and precise. Think like a cinematographer describing a shot list. Keep within 200 words.
	            For best results, build your prompts using this structure:

	            - Start with main action in a single sentence
	            - Add specific details about movements and gestures
	            - Describe character/object appearances precisely
	            - Include background and environment details
	            - Specify camera angles and movements
	            - Describe lighting and colors
	            - Note any changes or sudden events

	            See examples for more inspiration.

	            🎮 Parameter Guide

	            - Resolution Preset: Higher resolutions for detailed scenes, lower for faster generation and simpler scenes
	            - Seed: Save seed values to recreate specific styles or compositions you like
	            - Guidance Scale: Higher values (5-7) for accurate prompt following, lower values (3-5) for more creative freedom
	            - Inference Steps: More steps (40+) for quality, fewer steps (20-30) for speed
	            """
	        )

	    with gr.Tabs():
	        # Text to Video Tab
	        with gr.TabItem("Text to Video"):
	            with gr.Row():
	                with gr.Column():
	                    txt2vid_prompt = gr.Textbox(
	                        label="Step 1: Enter Your Prompt",
	                        placeholder="Describe the video you want to generate (minimum 50 characters)...",
	                        value="A man riding a motorcycle down a winding road, surrounded by lush, green scenery and distant mountains. The sky is clear with a few wispy clouds, and the sunlight glistens on the motorcycle as it speeds along.",
	                        lines=5,
	                    )
	                    txt2vid_negative_prompt = gr.Textbox(
	                        label="Step 2: Enter Negative Prompt (Optional)",
	                        placeholder="Describe what you don't want in the video...",
	                        value="worst quality, inconsistent motion...",
	                        lines=2,
	                    )

	                    txt2vid_preset = gr.Dropdown(
	                        choices=[p["label"] for p in preset_options],
	                        value="1216x704, 41 frames",
	                        label="Step 3: Choose Resolution Preset",
	                    )

	                    txt2vid_advanced = create_advanced_options()
	                    txt2vid_generate = gr.Button(
	                        "Step 5: Generate Video", variant="primary", size="lg"
	                    )

	                with gr.Column():
	                    txt2vid_output = gr.Video(label="Step 6: Generated Output")

	        # Image to Video Tab
	        with gr.TabItem("Image to Video"):
	            with gr.Row():
	                with gr.Column():
	                    img2vid_image = gr.Image(
	                        type="filepath",
	                        label="Step 1: Upload Input Image",
	                        elem_id="image_upload",
	                    )
	                    img2vid_prompt = gr.Textbox(
	                        label="Step 2: Enter Your Prompt",
	                        placeholder="Describe how you want to animate the image (minimum 50 characters)...",
	                        value="A man riding a motorcycle down a winding road, surrounded by lush, green scenery...",
	                        lines=5,
	                    )
	                    img2vid_negative_prompt = gr.Textbox(
	                        label="Step 3: Enter Negative Prompt (Optional)",
	                        placeholder="Describe what you don't want in the video...",
	                        value="worst quality, inconsistent motion...",
	                        lines=2,
	                    )

	                    img2vid_preset = gr.Dropdown(
	                        choices=[p["label"] for p in preset_options],
	                        value="1216x704, 41 frames",
	                        label="Step 4: Choose Resolution Preset",
	                    )

	                    img2vid_advanced = create_advanced_options()
	                    img2vid_generate = gr.Button(
	                        "Step 6: Generate Video", variant="primary", size="lg"
	                    )

	                with gr.Column():
	                    img2vid_output = gr.Video(label="Step 7: Generated Output")

	    # Event wiring. `*_advanced` is [seed, steps, guidance, height, width,
	    # num_frames, frame_rate], so [3:] selects exactly the four sliders that
	    # _preset_to_slider_updates returns updates for, in the same order.
	    # (The previous wiring sent preset_changed's six values to the three
	    # components of [4:], so the values did not line up with the sliders.)
	    txt2vid_preset.change(
	        fn=_preset_to_slider_updates,
	        inputs=[txt2vid_preset],
	        outputs=txt2vid_advanced[3:],
	    )

	    txt2vid_generate.click(
	        fn=generate_video_from_text,
	        inputs=[txt2vid_prompt, txt2vid_negative_prompt, *txt2vid_advanced],
	        outputs=txt2vid_output,
	    )

	    img2vid_preset.change(
	        fn=_preset_to_slider_updates,
	        inputs=[img2vid_preset],
	        outputs=img2vid_advanced[3:],
	    )

	    img2vid_generate.click(
	        fn=generate_video_from_image,
	        inputs=[
	            img2vid_image,
	            img2vid_prompt,
	            img2vid_negative_prompt,
	            *img2vid_advanced,
	        ],
	        outputs=img2vid_output,
	    )

	# Start serving the interface; share=True additionally asks Gradio for a
	# publicly reachable share link.
	iface.launch(share=True)