# Dataset settings.
# NOTE(review): the original indentation was lost (every line carried a
# trailing " |" table delimiter); nesting below is reconstructed from the
# key/value shape -- confirm against the consuming config schema.
data:
  data_dir: []
  # Empty-valued key followed by a single scalar key: treated as a mapping.
  caption_proportion:
    prompt: 1
  external_caption_suffixes: []
  external_clipscore_suffixes: []
  clip_thr_temperature: 1.0
  clip_thr: 0.0
  sort_dataset: false
  load_text_feat: false
  load_vae_feat: false
  transform: default_train
  type: SanaWebDatasetMS
  image_size: 512
  hq_only: false
  valid_num: 0

# Model settings.
# NOTE(review): indentation was lost in the source; the nesting of
# `resume_from` and the membership of `cfg_scale` .. `pag_applied_layers`
# in this section are reconstructed assumptions -- confirm against the
# consuming config schema.
model:
  model: SanaMS_600M_P1_D28
  image_size: 512
  mixed_precision: fp16
  fp32_attention: true
  # Explicit null instead of a bare empty value (same parsed result).
  load_from: null
  resume_from:
    checkpoint: null
    load_ema: false
    resume_lr_scheduler: true
    resume_optimizer: true
  aspect_ratio_type: ASPECT_RATIO_1024
  multi_scale: true
  pe_interpolation: 1.0
  micro_condition: false
  attn_type: linear
  cross_norm: false
  autocast_linear_attn: false
  ffn_type: glumbconv
  # Third entry was an empty sequence item, i.e. null.
  mlp_acts:
    - silu
    - silu
    - null
  mlp_ratio: 2.5
  use_pe: false
  qk_norm: false
  class_dropout_prob: 0.0
  linear_head_dim: 32

  cfg_scale: 4
  guidance_type: classifier-free
  pag_applied_layers: [14]

# Text-encoder settings.
# NOTE(review): indentation was lost in the source; the keys below are
# assumed to be children of `text_encoder` -- confirm against the schema.
text_encoder:
  text_encoder_name: gemma-2-2b-it
  caption_channels: 2304
  y_norm: false
  y_norm_scale_factor: 1.0
  model_max_length: 300
  chi_prompt: []

# VAE / autoencoder settings.
# NOTE(review): indentation was lost in the source; the keys below are
# assumed to be children of `vae` -- confirm against the schema.
vae:
  vae_type: dc-ae
  vae_pretrained: mit-han-lab/dc-ae-f32c32-sana-1.0
  scale_factor: 0.41407
  vae_latent_dim: 32
  vae_downsample_rate: 32
  sample_posterior: true

# Noise-scheduler settings.
# NOTE(review): indentation was lost in the source; the keys below are
# assumed to be children of `scheduler` -- confirm against the schema.
scheduler:
  train_sampling_steps: 1000
  # Canonical lowercase boolean (source had `True`).
  predict_v: true
  noise_schedule: linear_flow
  pred_sigma: false
  flow_shift: 1.0
  weighting_scheme: logit_normal
  logit_mean: 0.0
  logit_std: 1.0
  vis_sampler: flow_dpm-solver

# Training settings.
# NOTE(review): indentation was lost in the source (every line carried a
# trailing " |" table delimiter). All keys from `num_workers` through
# `ema_decay` are assumed to belong to `train`; in particular `work_dir`,
# `skip_step` and the loss keys could instead be top-level -- confirm
# against the consuming config schema.
train:
  num_workers: 4
  seed: 43
  train_batch_size: 32
  num_epochs: 100
  gradient_accumulation_steps: 1
  grad_checkpointing: false
  gradient_clip: 1.0
  gc_step: 1

  optimizer:
    eps: 1.0e-10
    lr: 0.0001
    type: AdamW
    weight_decay: 0.03
  lr_schedule: constant
  lr_schedule_args:
    num_warmup_steps: 500
  auto_lr:
    rule: sqrt
  ema_rate: 0.9999
  eval_batch_size: 16
  use_fsdp: false
  use_flash_attn: false
  eval_sampling_steps: 250
  lora_rank: 4
  log_interval: 50
  # Quoted on purpose: this is the string "null", not a YAML null.
  mask_type: 'null'
  mask_loss_coef: 0.0
  load_mask_index: false
  snr_loss: false
  real_prompt_ratio: 1.0
  debug_nan: false

  save_image_epochs: 1
  save_model_epochs: 1
  save_model_steps: 1000000

  visualize: false
  null_embed_root: output/pretrained_models/
  valid_prompt_embed_root: output/tmp_embed/
  validation_prompts:
    - dog
    - portrait photo of a girl, photograph, highly detailed face, depth of field
    - Self-portrait oil painting, a beautiful cyborg with golden hair, 8k
    - Astronaut in a jungle, cold color palette, muted colors, detailed, 8k
    - A photo of beautiful mountain with realistic sunset and blue lake, highly detailed, masterpiece
  local_save_vis: false
  deterministic_validation: true
  online_metric: false
  eval_metric_step: 5000
  online_metric_dir: metric_helper

  work_dir: /cache/exps/
  skip_step: 0

  loss_type: huber
  huber_c: 0.001
  num_ddim_timesteps: 50
  w_max: 15.0
  w_min: 3.0
  ema_decay: 0.95