# Pyramid-Flow / scripts / train_pyramid_flow_without_ar.sh
# Provenance: uploaded by sironagasuyagi via huggingface_hub (commit 910e2ad, verified)
# NOTE: this header precedes the shebang, so invoke the script explicitly with
# `bash train_pyramid_flow_without_ar.sh` rather than relying on the shebang line.
#!/bin/bash
# This script is used for Pyramid-Flow Image and Video Generation Training (without using Temporal Pyramid and autoregressive training)
# Since the design of spatial pyramid and temporal pyramid are decoupled, we can only use the spatial pyramid flow
# to train with full-sequence diffusion, which is also more effective than the normal flow matching training strategy

# Fail fast: abort on any command error, unset variable, or pipeline failure.
set -euo pipefail

readonly GPUS=8 # The gpu number
readonly TASK=t2i # t2i or t2v
readonly SHARD_STRATEGY=zero2 # zero2 or zero3
readonly MODEL_NAME=pyramid_flux # The model name, `pyramid_flux` or `pyramid_mmdit`
readonly MODEL_PATH=/PATH/pyramid-flow-miniflux # The downloaded ckpt dir. IMPORTANT: It should match with model_name, flux or mmdit (sd3)
readonly VARIANT=diffusion_transformer_image # The DiT Variant, diffusion_transformer_image or diffusion_transformer_384p
readonly OUTPUT_DIR=/PATH/output_dir # The checkpoint saving dir
readonly NUM_FRAMES=8 # e.g., 8 for 2s, 16 for 5s, 32 for 10s
readonly BATCH_SIZE=4 # It should satisfy batch_size % 4 == 0
readonly RESOLUTION="768p" # 384p or 768p
readonly ANNO_FILE=annotation/image_text.jsonl # The annotation file path

# Enforce the batch-size constraint stated above instead of failing deep
# inside the training code.
if (( BATCH_SIZE % 4 != 0 )); then
  printf 'error: BATCH_SIZE (%s) must be divisible by 4\n' "$BATCH_SIZE" >&2
  exit 1
fi
# Launch one training process per GPU via torchrun.
# All expansions are quoted (ShellCheck SC2086): paths such as MODEL_PATH or
# OUTPUT_DIR may contain spaces, and an accidentally-empty variable must not
# silently vanish from the argument list.
torchrun --nproc_per_node "$GPUS" \
    train/train_pyramid_flow.py \
    --num_workers 8 \
    --task "$TASK" \
    --use_fsdp \
    --fsdp_shard_strategy "$SHARD_STRATEGY" \
    --use_flash_attn \
    --load_text_encoder \
    --load_vae \
    --model_name "$MODEL_NAME" \
    --model_path "$MODEL_PATH" \
    --model_dtype bf16 \
    --model_variant "$VARIANT" \
    --schedule_shift 1.0 \
    --gradient_accumulation_steps 1 \
    --output_dir "$OUTPUT_DIR" \
    --batch_size "$BATCH_SIZE" \
    --max_frames "$NUM_FRAMES" \
    --resolution "$RESOLUTION" \
    --anno_file "$ANNO_FILE" \
    --frame_per_unit 1 \
    --lr_scheduler constant_with_warmup \
    --opt adamw \
    --opt_beta1 0.9 \
    --opt_beta2 0.95 \
    --seed 42 \
    --weight_decay 1e-4 \
    --clip_grad 1.0 \
    --lr 1e-4 \
    --warmup_steps 1000 \
    --epochs 20 \
    --iters_per_epoch 2000 \
    --report_to tensorboard \
    --print_freq 40 \
    --save_ckpt_freq 1