lllyasviel
committed on
Commit
·
ffcc874
1
Parent(s):
4ae54b7
- .gitignore +0 -2
- models/checkpoints/put_checkpoints_here +0 -0
- models/clip/put_clip_or_text_encoder_models_here +0 -0
- models/clip_vision/put_clip_vision_models_here +0 -0
- models/configs/anything_v3.yaml +73 -0
- models/configs/v1-inference.yaml +70 -0
- models/configs/v1-inference_clip_skip_2.yaml +73 -0
- models/configs/v1-inference_clip_skip_2_fp16.yaml +74 -0
- models/configs/v1-inference_fp16.yaml +71 -0
- models/configs/v1-inpainting-inference.yaml +71 -0
- models/configs/v2-inference-v.yaml +68 -0
- models/configs/v2-inference-v_fp32.yaml +68 -0
- models/configs/v2-inference.yaml +67 -0
- models/configs/v2-inference_fp32.yaml +67 -0
- models/configs/v2-inpainting-inference.yaml +158 -0
- models/controlnet/put_controlnets_and_t2i_here +0 -0
- models/diffusers/put_diffusers_models_here +0 -0
- models/embeddings/put_embeddings_or_textual_inversion_concepts_here +0 -0
- models/gligen/put_gligen_models_here +0 -0
- models/hypernetworks/put_hypernetworks_here +0 -0
- models/loras/put_loras_here +0 -0
- models/style_models/put_t2i_style_model_here +0 -0
- models/unet/put_unet_files_here +0 -0
- models/upscale_models/put_esrgan_and_other_upscale_models_here +0 -0
- models/vae/put_vae_here +0 -0
- models/vae_approx/put_taesd_encoder_pth_and_taesd_decoder_pth_here +0 -0
- modules/{sd.py → core.py} +7 -2
- webui.py +22 -39
.gitignore
CHANGED
@@ -5,8 +5,6 @@ __pycache__
|
|
5 |
/repositories
|
6 |
/venv
|
7 |
/tmp
|
8 |
-
/model.ckpt
|
9 |
-
/models/*
|
10 |
/ui-config.json
|
11 |
/outputs
|
12 |
/config.json
|
|
|
5 |
/repositories
|
6 |
/venv
|
7 |
/tmp
|
|
|
|
|
8 |
/ui-config.json
|
9 |
/outputs
|
10 |
/config.json
|
models/checkpoints/put_checkpoints_here
ADDED
File without changes
|
models/clip/put_clip_or_text_encoder_models_here
ADDED
File without changes
|
models/clip_vision/put_clip_vision_models_here
ADDED
File without changes
|
models/configs/anything_v3.yaml
ADDED
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
model:
|
2 |
+
base_learning_rate: 1.0e-04
|
3 |
+
target: ldm.models.diffusion.ddpm.LatentDiffusion
|
4 |
+
params:
|
5 |
+
linear_start: 0.00085
|
6 |
+
linear_end: 0.0120
|
7 |
+
num_timesteps_cond: 1
|
8 |
+
log_every_t: 200
|
9 |
+
timesteps: 1000
|
10 |
+
first_stage_key: "jpg"
|
11 |
+
cond_stage_key: "txt"
|
12 |
+
image_size: 64
|
13 |
+
channels: 4
|
14 |
+
cond_stage_trainable: false # Note: different from the one we trained before
|
15 |
+
conditioning_key: crossattn
|
16 |
+
monitor: val/loss_simple_ema
|
17 |
+
scale_factor: 0.18215
|
18 |
+
use_ema: False
|
19 |
+
|
20 |
+
scheduler_config: # 10000 warmup steps
|
21 |
+
target: ldm.lr_scheduler.LambdaLinearScheduler
|
22 |
+
params:
|
23 |
+
warm_up_steps: [ 10000 ]
|
24 |
+
cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
|
25 |
+
f_start: [ 1.e-6 ]
|
26 |
+
f_max: [ 1. ]
|
27 |
+
f_min: [ 1. ]
|
28 |
+
|
29 |
+
unet_config:
|
30 |
+
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
|
31 |
+
params:
|
32 |
+
image_size: 32 # unused
|
33 |
+
in_channels: 4
|
34 |
+
out_channels: 4
|
35 |
+
model_channels: 320
|
36 |
+
attention_resolutions: [ 4, 2, 1 ]
|
37 |
+
num_res_blocks: 2
|
38 |
+
channel_mult: [ 1, 2, 4, 4 ]
|
39 |
+
num_heads: 8
|
40 |
+
use_spatial_transformer: True
|
41 |
+
transformer_depth: 1
|
42 |
+
context_dim: 768
|
43 |
+
use_checkpoint: True
|
44 |
+
legacy: False
|
45 |
+
|
46 |
+
first_stage_config:
|
47 |
+
target: ldm.models.autoencoder.AutoencoderKL
|
48 |
+
params:
|
49 |
+
embed_dim: 4
|
50 |
+
monitor: val/rec_loss
|
51 |
+
ddconfig:
|
52 |
+
double_z: true
|
53 |
+
z_channels: 4
|
54 |
+
resolution: 256
|
55 |
+
in_channels: 3
|
56 |
+
out_ch: 3
|
57 |
+
ch: 128
|
58 |
+
ch_mult:
|
59 |
+
- 1
|
60 |
+
- 2
|
61 |
+
- 4
|
62 |
+
- 4
|
63 |
+
num_res_blocks: 2
|
64 |
+
attn_resolutions: []
|
65 |
+
dropout: 0.0
|
66 |
+
lossconfig:
|
67 |
+
target: torch.nn.Identity
|
68 |
+
|
69 |
+
cond_stage_config:
|
70 |
+
target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
|
71 |
+
params:
|
72 |
+
layer: "hidden"
|
73 |
+
layer_idx: -2
|
models/configs/v1-inference.yaml
ADDED
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
model:
|
2 |
+
base_learning_rate: 1.0e-04
|
3 |
+
target: ldm.models.diffusion.ddpm.LatentDiffusion
|
4 |
+
params:
|
5 |
+
linear_start: 0.00085
|
6 |
+
linear_end: 0.0120
|
7 |
+
num_timesteps_cond: 1
|
8 |
+
log_every_t: 200
|
9 |
+
timesteps: 1000
|
10 |
+
first_stage_key: "jpg"
|
11 |
+
cond_stage_key: "txt"
|
12 |
+
image_size: 64
|
13 |
+
channels: 4
|
14 |
+
cond_stage_trainable: false # Note: different from the one we trained before
|
15 |
+
conditioning_key: crossattn
|
16 |
+
monitor: val/loss_simple_ema
|
17 |
+
scale_factor: 0.18215
|
18 |
+
use_ema: False
|
19 |
+
|
20 |
+
scheduler_config: # 10000 warmup steps
|
21 |
+
target: ldm.lr_scheduler.LambdaLinearScheduler
|
22 |
+
params:
|
23 |
+
warm_up_steps: [ 10000 ]
|
24 |
+
cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
|
25 |
+
f_start: [ 1.e-6 ]
|
26 |
+
f_max: [ 1. ]
|
27 |
+
f_min: [ 1. ]
|
28 |
+
|
29 |
+
unet_config:
|
30 |
+
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
|
31 |
+
params:
|
32 |
+
image_size: 32 # unused
|
33 |
+
in_channels: 4
|
34 |
+
out_channels: 4
|
35 |
+
model_channels: 320
|
36 |
+
attention_resolutions: [ 4, 2, 1 ]
|
37 |
+
num_res_blocks: 2
|
38 |
+
channel_mult: [ 1, 2, 4, 4 ]
|
39 |
+
num_heads: 8
|
40 |
+
use_spatial_transformer: True
|
41 |
+
transformer_depth: 1
|
42 |
+
context_dim: 768
|
43 |
+
use_checkpoint: True
|
44 |
+
legacy: False
|
45 |
+
|
46 |
+
first_stage_config:
|
47 |
+
target: ldm.models.autoencoder.AutoencoderKL
|
48 |
+
params:
|
49 |
+
embed_dim: 4
|
50 |
+
monitor: val/rec_loss
|
51 |
+
ddconfig:
|
52 |
+
double_z: true
|
53 |
+
z_channels: 4
|
54 |
+
resolution: 256
|
55 |
+
in_channels: 3
|
56 |
+
out_ch: 3
|
57 |
+
ch: 128
|
58 |
+
ch_mult:
|
59 |
+
- 1
|
60 |
+
- 2
|
61 |
+
- 4
|
62 |
+
- 4
|
63 |
+
num_res_blocks: 2
|
64 |
+
attn_resolutions: []
|
65 |
+
dropout: 0.0
|
66 |
+
lossconfig:
|
67 |
+
target: torch.nn.Identity
|
68 |
+
|
69 |
+
cond_stage_config:
|
70 |
+
target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
|
models/configs/v1-inference_clip_skip_2.yaml
ADDED
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
model:
|
2 |
+
base_learning_rate: 1.0e-04
|
3 |
+
target: ldm.models.diffusion.ddpm.LatentDiffusion
|
4 |
+
params:
|
5 |
+
linear_start: 0.00085
|
6 |
+
linear_end: 0.0120
|
7 |
+
num_timesteps_cond: 1
|
8 |
+
log_every_t: 200
|
9 |
+
timesteps: 1000
|
10 |
+
first_stage_key: "jpg"
|
11 |
+
cond_stage_key: "txt"
|
12 |
+
image_size: 64
|
13 |
+
channels: 4
|
14 |
+
cond_stage_trainable: false # Note: different from the one we trained before
|
15 |
+
conditioning_key: crossattn
|
16 |
+
monitor: val/loss_simple_ema
|
17 |
+
scale_factor: 0.18215
|
18 |
+
use_ema: False
|
19 |
+
|
20 |
+
scheduler_config: # 10000 warmup steps
|
21 |
+
target: ldm.lr_scheduler.LambdaLinearScheduler
|
22 |
+
params:
|
23 |
+
warm_up_steps: [ 10000 ]
|
24 |
+
cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
|
25 |
+
f_start: [ 1.e-6 ]
|
26 |
+
f_max: [ 1. ]
|
27 |
+
f_min: [ 1. ]
|
28 |
+
|
29 |
+
unet_config:
|
30 |
+
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
|
31 |
+
params:
|
32 |
+
image_size: 32 # unused
|
33 |
+
in_channels: 4
|
34 |
+
out_channels: 4
|
35 |
+
model_channels: 320
|
36 |
+
attention_resolutions: [ 4, 2, 1 ]
|
37 |
+
num_res_blocks: 2
|
38 |
+
channel_mult: [ 1, 2, 4, 4 ]
|
39 |
+
num_heads: 8
|
40 |
+
use_spatial_transformer: True
|
41 |
+
transformer_depth: 1
|
42 |
+
context_dim: 768
|
43 |
+
use_checkpoint: True
|
44 |
+
legacy: False
|
45 |
+
|
46 |
+
first_stage_config:
|
47 |
+
target: ldm.models.autoencoder.AutoencoderKL
|
48 |
+
params:
|
49 |
+
embed_dim: 4
|
50 |
+
monitor: val/rec_loss
|
51 |
+
ddconfig:
|
52 |
+
double_z: true
|
53 |
+
z_channels: 4
|
54 |
+
resolution: 256
|
55 |
+
in_channels: 3
|
56 |
+
out_ch: 3
|
57 |
+
ch: 128
|
58 |
+
ch_mult:
|
59 |
+
- 1
|
60 |
+
- 2
|
61 |
+
- 4
|
62 |
+
- 4
|
63 |
+
num_res_blocks: 2
|
64 |
+
attn_resolutions: []
|
65 |
+
dropout: 0.0
|
66 |
+
lossconfig:
|
67 |
+
target: torch.nn.Identity
|
68 |
+
|
69 |
+
cond_stage_config:
|
70 |
+
target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
|
71 |
+
params:
|
72 |
+
layer: "hidden"
|
73 |
+
layer_idx: -2
|
models/configs/v1-inference_clip_skip_2_fp16.yaml
ADDED
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
model:
|
2 |
+
base_learning_rate: 1.0e-04
|
3 |
+
target: ldm.models.diffusion.ddpm.LatentDiffusion
|
4 |
+
params:
|
5 |
+
linear_start: 0.00085
|
6 |
+
linear_end: 0.0120
|
7 |
+
num_timesteps_cond: 1
|
8 |
+
log_every_t: 200
|
9 |
+
timesteps: 1000
|
10 |
+
first_stage_key: "jpg"
|
11 |
+
cond_stage_key: "txt"
|
12 |
+
image_size: 64
|
13 |
+
channels: 4
|
14 |
+
cond_stage_trainable: false # Note: different from the one we trained before
|
15 |
+
conditioning_key: crossattn
|
16 |
+
monitor: val/loss_simple_ema
|
17 |
+
scale_factor: 0.18215
|
18 |
+
use_ema: False
|
19 |
+
|
20 |
+
scheduler_config: # 10000 warmup steps
|
21 |
+
target: ldm.lr_scheduler.LambdaLinearScheduler
|
22 |
+
params:
|
23 |
+
warm_up_steps: [ 10000 ]
|
24 |
+
cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
|
25 |
+
f_start: [ 1.e-6 ]
|
26 |
+
f_max: [ 1. ]
|
27 |
+
f_min: [ 1. ]
|
28 |
+
|
29 |
+
unet_config:
|
30 |
+
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
|
31 |
+
params:
|
32 |
+
use_fp16: True
|
33 |
+
image_size: 32 # unused
|
34 |
+
in_channels: 4
|
35 |
+
out_channels: 4
|
36 |
+
model_channels: 320
|
37 |
+
attention_resolutions: [ 4, 2, 1 ]
|
38 |
+
num_res_blocks: 2
|
39 |
+
channel_mult: [ 1, 2, 4, 4 ]
|
40 |
+
num_heads: 8
|
41 |
+
use_spatial_transformer: True
|
42 |
+
transformer_depth: 1
|
43 |
+
context_dim: 768
|
44 |
+
use_checkpoint: True
|
45 |
+
legacy: False
|
46 |
+
|
47 |
+
first_stage_config:
|
48 |
+
target: ldm.models.autoencoder.AutoencoderKL
|
49 |
+
params:
|
50 |
+
embed_dim: 4
|
51 |
+
monitor: val/rec_loss
|
52 |
+
ddconfig:
|
53 |
+
double_z: true
|
54 |
+
z_channels: 4
|
55 |
+
resolution: 256
|
56 |
+
in_channels: 3
|
57 |
+
out_ch: 3
|
58 |
+
ch: 128
|
59 |
+
ch_mult:
|
60 |
+
- 1
|
61 |
+
- 2
|
62 |
+
- 4
|
63 |
+
- 4
|
64 |
+
num_res_blocks: 2
|
65 |
+
attn_resolutions: []
|
66 |
+
dropout: 0.0
|
67 |
+
lossconfig:
|
68 |
+
target: torch.nn.Identity
|
69 |
+
|
70 |
+
cond_stage_config:
|
71 |
+
target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
|
72 |
+
params:
|
73 |
+
layer: "hidden"
|
74 |
+
layer_idx: -2
|
models/configs/v1-inference_fp16.yaml
ADDED
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
model:
|
2 |
+
base_learning_rate: 1.0e-04
|
3 |
+
target: ldm.models.diffusion.ddpm.LatentDiffusion
|
4 |
+
params:
|
5 |
+
linear_start: 0.00085
|
6 |
+
linear_end: 0.0120
|
7 |
+
num_timesteps_cond: 1
|
8 |
+
log_every_t: 200
|
9 |
+
timesteps: 1000
|
10 |
+
first_stage_key: "jpg"
|
11 |
+
cond_stage_key: "txt"
|
12 |
+
image_size: 64
|
13 |
+
channels: 4
|
14 |
+
cond_stage_trainable: false # Note: different from the one we trained before
|
15 |
+
conditioning_key: crossattn
|
16 |
+
monitor: val/loss_simple_ema
|
17 |
+
scale_factor: 0.18215
|
18 |
+
use_ema: False
|
19 |
+
|
20 |
+
scheduler_config: # 10000 warmup steps
|
21 |
+
target: ldm.lr_scheduler.LambdaLinearScheduler
|
22 |
+
params:
|
23 |
+
warm_up_steps: [ 10000 ]
|
24 |
+
cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
|
25 |
+
f_start: [ 1.e-6 ]
|
26 |
+
f_max: [ 1. ]
|
27 |
+
f_min: [ 1. ]
|
28 |
+
|
29 |
+
unet_config:
|
30 |
+
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
|
31 |
+
params:
|
32 |
+
use_fp16: True
|
33 |
+
image_size: 32 # unused
|
34 |
+
in_channels: 4
|
35 |
+
out_channels: 4
|
36 |
+
model_channels: 320
|
37 |
+
attention_resolutions: [ 4, 2, 1 ]
|
38 |
+
num_res_blocks: 2
|
39 |
+
channel_mult: [ 1, 2, 4, 4 ]
|
40 |
+
num_heads: 8
|
41 |
+
use_spatial_transformer: True
|
42 |
+
transformer_depth: 1
|
43 |
+
context_dim: 768
|
44 |
+
use_checkpoint: True
|
45 |
+
legacy: False
|
46 |
+
|
47 |
+
first_stage_config:
|
48 |
+
target: ldm.models.autoencoder.AutoencoderKL
|
49 |
+
params:
|
50 |
+
embed_dim: 4
|
51 |
+
monitor: val/rec_loss
|
52 |
+
ddconfig:
|
53 |
+
double_z: true
|
54 |
+
z_channels: 4
|
55 |
+
resolution: 256
|
56 |
+
in_channels: 3
|
57 |
+
out_ch: 3
|
58 |
+
ch: 128
|
59 |
+
ch_mult:
|
60 |
+
- 1
|
61 |
+
- 2
|
62 |
+
- 4
|
63 |
+
- 4
|
64 |
+
num_res_blocks: 2
|
65 |
+
attn_resolutions: []
|
66 |
+
dropout: 0.0
|
67 |
+
lossconfig:
|
68 |
+
target: torch.nn.Identity
|
69 |
+
|
70 |
+
cond_stage_config:
|
71 |
+
target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
|
models/configs/v1-inpainting-inference.yaml
ADDED
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
model:
|
2 |
+
base_learning_rate: 7.5e-05
|
3 |
+
target: ldm.models.diffusion.ddpm.LatentInpaintDiffusion
|
4 |
+
params:
|
5 |
+
linear_start: 0.00085
|
6 |
+
linear_end: 0.0120
|
7 |
+
num_timesteps_cond: 1
|
8 |
+
log_every_t: 200
|
9 |
+
timesteps: 1000
|
10 |
+
first_stage_key: "jpg"
|
11 |
+
cond_stage_key: "txt"
|
12 |
+
image_size: 64
|
13 |
+
channels: 4
|
14 |
+
cond_stage_trainable: false # Note: different from the one we trained before
|
15 |
+
conditioning_key: hybrid # important
|
16 |
+
monitor: val/loss_simple_ema
|
17 |
+
scale_factor: 0.18215
|
18 |
+
finetune_keys: null
|
19 |
+
|
20 |
+
scheduler_config: # 10000 warmup steps
|
21 |
+
target: ldm.lr_scheduler.LambdaLinearScheduler
|
22 |
+
params:
|
23 |
+
warm_up_steps: [ 2500 ] # NOTE for resuming. use 10000 if starting from scratch
|
24 |
+
cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
|
25 |
+
f_start: [ 1.e-6 ]
|
26 |
+
f_max: [ 1. ]
|
27 |
+
f_min: [ 1. ]
|
28 |
+
|
29 |
+
unet_config:
|
30 |
+
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
|
31 |
+
params:
|
32 |
+
image_size: 32 # unused
|
33 |
+
in_channels: 9 # 4 data + 4 downscaled image + 1 mask
|
34 |
+
out_channels: 4
|
35 |
+
model_channels: 320
|
36 |
+
attention_resolutions: [ 4, 2, 1 ]
|
37 |
+
num_res_blocks: 2
|
38 |
+
channel_mult: [ 1, 2, 4, 4 ]
|
39 |
+
num_heads: 8
|
40 |
+
use_spatial_transformer: True
|
41 |
+
transformer_depth: 1
|
42 |
+
context_dim: 768
|
43 |
+
use_checkpoint: True
|
44 |
+
legacy: False
|
45 |
+
|
46 |
+
first_stage_config:
|
47 |
+
target: ldm.models.autoencoder.AutoencoderKL
|
48 |
+
params:
|
49 |
+
embed_dim: 4
|
50 |
+
monitor: val/rec_loss
|
51 |
+
ddconfig:
|
52 |
+
double_z: true
|
53 |
+
z_channels: 4
|
54 |
+
resolution: 256
|
55 |
+
in_channels: 3
|
56 |
+
out_ch: 3
|
57 |
+
ch: 128
|
58 |
+
ch_mult:
|
59 |
+
- 1
|
60 |
+
- 2
|
61 |
+
- 4
|
62 |
+
- 4
|
63 |
+
num_res_blocks: 2
|
64 |
+
attn_resolutions: []
|
65 |
+
dropout: 0.0
|
66 |
+
lossconfig:
|
67 |
+
target: torch.nn.Identity
|
68 |
+
|
69 |
+
cond_stage_config:
|
70 |
+
target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
|
71 |
+
|
models/configs/v2-inference-v.yaml
ADDED
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
model:
|
2 |
+
base_learning_rate: 1.0e-4
|
3 |
+
target: ldm.models.diffusion.ddpm.LatentDiffusion
|
4 |
+
params:
|
5 |
+
parameterization: "v"
|
6 |
+
linear_start: 0.00085
|
7 |
+
linear_end: 0.0120
|
8 |
+
num_timesteps_cond: 1
|
9 |
+
log_every_t: 200
|
10 |
+
timesteps: 1000
|
11 |
+
first_stage_key: "jpg"
|
12 |
+
cond_stage_key: "txt"
|
13 |
+
image_size: 64
|
14 |
+
channels: 4
|
15 |
+
cond_stage_trainable: false
|
16 |
+
conditioning_key: crossattn
|
17 |
+
monitor: val/loss_simple_ema
|
18 |
+
scale_factor: 0.18215
|
19 |
+
use_ema: False # we set this to false because this is an inference only config
|
20 |
+
|
21 |
+
unet_config:
|
22 |
+
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
|
23 |
+
params:
|
24 |
+
use_checkpoint: True
|
25 |
+
use_fp16: True
|
26 |
+
image_size: 32 # unused
|
27 |
+
in_channels: 4
|
28 |
+
out_channels: 4
|
29 |
+
model_channels: 320
|
30 |
+
attention_resolutions: [ 4, 2, 1 ]
|
31 |
+
num_res_blocks: 2
|
32 |
+
channel_mult: [ 1, 2, 4, 4 ]
|
33 |
+
num_head_channels: 64 # need to fix for flash-attn
|
34 |
+
use_spatial_transformer: True
|
35 |
+
use_linear_in_transformer: True
|
36 |
+
transformer_depth: 1
|
37 |
+
context_dim: 1024
|
38 |
+
legacy: False
|
39 |
+
|
40 |
+
first_stage_config:
|
41 |
+
target: ldm.models.autoencoder.AutoencoderKL
|
42 |
+
params:
|
43 |
+
embed_dim: 4
|
44 |
+
monitor: val/rec_loss
|
45 |
+
ddconfig:
|
46 |
+
#attn_type: "vanilla-xformers"
|
47 |
+
double_z: true
|
48 |
+
z_channels: 4
|
49 |
+
resolution: 256
|
50 |
+
in_channels: 3
|
51 |
+
out_ch: 3
|
52 |
+
ch: 128
|
53 |
+
ch_mult:
|
54 |
+
- 1
|
55 |
+
- 2
|
56 |
+
- 4
|
57 |
+
- 4
|
58 |
+
num_res_blocks: 2
|
59 |
+
attn_resolutions: []
|
60 |
+
dropout: 0.0
|
61 |
+
lossconfig:
|
62 |
+
target: torch.nn.Identity
|
63 |
+
|
64 |
+
cond_stage_config:
|
65 |
+
target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
|
66 |
+
params:
|
67 |
+
freeze: True
|
68 |
+
layer: "penultimate"
|
models/configs/v2-inference-v_fp32.yaml
ADDED
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
model:
|
2 |
+
base_learning_rate: 1.0e-4
|
3 |
+
target: ldm.models.diffusion.ddpm.LatentDiffusion
|
4 |
+
params:
|
5 |
+
parameterization: "v"
|
6 |
+
linear_start: 0.00085
|
7 |
+
linear_end: 0.0120
|
8 |
+
num_timesteps_cond: 1
|
9 |
+
log_every_t: 200
|
10 |
+
timesteps: 1000
|
11 |
+
first_stage_key: "jpg"
|
12 |
+
cond_stage_key: "txt"
|
13 |
+
image_size: 64
|
14 |
+
channels: 4
|
15 |
+
cond_stage_trainable: false
|
16 |
+
conditioning_key: crossattn
|
17 |
+
monitor: val/loss_simple_ema
|
18 |
+
scale_factor: 0.18215
|
19 |
+
use_ema: False # we set this to false because this is an inference only config
|
20 |
+
|
21 |
+
unet_config:
|
22 |
+
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
|
23 |
+
params:
|
24 |
+
use_checkpoint: True
|
25 |
+
use_fp16: False
|
26 |
+
image_size: 32 # unused
|
27 |
+
in_channels: 4
|
28 |
+
out_channels: 4
|
29 |
+
model_channels: 320
|
30 |
+
attention_resolutions: [ 4, 2, 1 ]
|
31 |
+
num_res_blocks: 2
|
32 |
+
channel_mult: [ 1, 2, 4, 4 ]
|
33 |
+
num_head_channels: 64 # need to fix for flash-attn
|
34 |
+
use_spatial_transformer: True
|
35 |
+
use_linear_in_transformer: True
|
36 |
+
transformer_depth: 1
|
37 |
+
context_dim: 1024
|
38 |
+
legacy: False
|
39 |
+
|
40 |
+
first_stage_config:
|
41 |
+
target: ldm.models.autoencoder.AutoencoderKL
|
42 |
+
params:
|
43 |
+
embed_dim: 4
|
44 |
+
monitor: val/rec_loss
|
45 |
+
ddconfig:
|
46 |
+
#attn_type: "vanilla-xformers"
|
47 |
+
double_z: true
|
48 |
+
z_channels: 4
|
49 |
+
resolution: 256
|
50 |
+
in_channels: 3
|
51 |
+
out_ch: 3
|
52 |
+
ch: 128
|
53 |
+
ch_mult:
|
54 |
+
- 1
|
55 |
+
- 2
|
56 |
+
- 4
|
57 |
+
- 4
|
58 |
+
num_res_blocks: 2
|
59 |
+
attn_resolutions: []
|
60 |
+
dropout: 0.0
|
61 |
+
lossconfig:
|
62 |
+
target: torch.nn.Identity
|
63 |
+
|
64 |
+
cond_stage_config:
|
65 |
+
target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
|
66 |
+
params:
|
67 |
+
freeze: True
|
68 |
+
layer: "penultimate"
|
models/configs/v2-inference.yaml
ADDED
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
model:
|
2 |
+
base_learning_rate: 1.0e-4
|
3 |
+
target: ldm.models.diffusion.ddpm.LatentDiffusion
|
4 |
+
params:
|
5 |
+
linear_start: 0.00085
|
6 |
+
linear_end: 0.0120
|
7 |
+
num_timesteps_cond: 1
|
8 |
+
log_every_t: 200
|
9 |
+
timesteps: 1000
|
10 |
+
first_stage_key: "jpg"
|
11 |
+
cond_stage_key: "txt"
|
12 |
+
image_size: 64
|
13 |
+
channels: 4
|
14 |
+
cond_stage_trainable: false
|
15 |
+
conditioning_key: crossattn
|
16 |
+
monitor: val/loss_simple_ema
|
17 |
+
scale_factor: 0.18215
|
18 |
+
use_ema: False # we set this to false because this is an inference only config
|
19 |
+
|
20 |
+
unet_config:
|
21 |
+
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
|
22 |
+
params:
|
23 |
+
use_checkpoint: True
|
24 |
+
use_fp16: True
|
25 |
+
image_size: 32 # unused
|
26 |
+
in_channels: 4
|
27 |
+
out_channels: 4
|
28 |
+
model_channels: 320
|
29 |
+
attention_resolutions: [ 4, 2, 1 ]
|
30 |
+
num_res_blocks: 2
|
31 |
+
channel_mult: [ 1, 2, 4, 4 ]
|
32 |
+
num_head_channels: 64 # need to fix for flash-attn
|
33 |
+
use_spatial_transformer: True
|
34 |
+
use_linear_in_transformer: True
|
35 |
+
transformer_depth: 1
|
36 |
+
context_dim: 1024
|
37 |
+
legacy: False
|
38 |
+
|
39 |
+
first_stage_config:
|
40 |
+
target: ldm.models.autoencoder.AutoencoderKL
|
41 |
+
params:
|
42 |
+
embed_dim: 4
|
43 |
+
monitor: val/rec_loss
|
44 |
+
ddconfig:
|
45 |
+
#attn_type: "vanilla-xformers"
|
46 |
+
double_z: true
|
47 |
+
z_channels: 4
|
48 |
+
resolution: 256
|
49 |
+
in_channels: 3
|
50 |
+
out_ch: 3
|
51 |
+
ch: 128
|
52 |
+
ch_mult:
|
53 |
+
- 1
|
54 |
+
- 2
|
55 |
+
- 4
|
56 |
+
- 4
|
57 |
+
num_res_blocks: 2
|
58 |
+
attn_resolutions: []
|
59 |
+
dropout: 0.0
|
60 |
+
lossconfig:
|
61 |
+
target: torch.nn.Identity
|
62 |
+
|
63 |
+
cond_stage_config:
|
64 |
+
target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
|
65 |
+
params:
|
66 |
+
freeze: True
|
67 |
+
layer: "penultimate"
|
models/configs/v2-inference_fp32.yaml
ADDED
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
model:
|
2 |
+
base_learning_rate: 1.0e-4
|
3 |
+
target: ldm.models.diffusion.ddpm.LatentDiffusion
|
4 |
+
params:
|
5 |
+
linear_start: 0.00085
|
6 |
+
linear_end: 0.0120
|
7 |
+
num_timesteps_cond: 1
|
8 |
+
log_every_t: 200
|
9 |
+
timesteps: 1000
|
10 |
+
first_stage_key: "jpg"
|
11 |
+
cond_stage_key: "txt"
|
12 |
+
image_size: 64
|
13 |
+
channels: 4
|
14 |
+
cond_stage_trainable: false
|
15 |
+
conditioning_key: crossattn
|
16 |
+
monitor: val/loss_simple_ema
|
17 |
+
scale_factor: 0.18215
|
18 |
+
use_ema: False # we set this to false because this is an inference only config
|
19 |
+
|
20 |
+
unet_config:
|
21 |
+
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
|
22 |
+
params:
|
23 |
+
use_checkpoint: True
|
24 |
+
use_fp16: False
|
25 |
+
image_size: 32 # unused
|
26 |
+
in_channels: 4
|
27 |
+
out_channels: 4
|
28 |
+
model_channels: 320
|
29 |
+
attention_resolutions: [ 4, 2, 1 ]
|
30 |
+
num_res_blocks: 2
|
31 |
+
channel_mult: [ 1, 2, 4, 4 ]
|
32 |
+
num_head_channels: 64 # need to fix for flash-attn
|
33 |
+
use_spatial_transformer: True
|
34 |
+
use_linear_in_transformer: True
|
35 |
+
transformer_depth: 1
|
36 |
+
context_dim: 1024
|
37 |
+
legacy: False
|
38 |
+
|
39 |
+
first_stage_config:
|
40 |
+
target: ldm.models.autoencoder.AutoencoderKL
|
41 |
+
params:
|
42 |
+
embed_dim: 4
|
43 |
+
monitor: val/rec_loss
|
44 |
+
ddconfig:
|
45 |
+
#attn_type: "vanilla-xformers"
|
46 |
+
double_z: true
|
47 |
+
z_channels: 4
|
48 |
+
resolution: 256
|
49 |
+
in_channels: 3
|
50 |
+
out_ch: 3
|
51 |
+
ch: 128
|
52 |
+
ch_mult:
|
53 |
+
- 1
|
54 |
+
- 2
|
55 |
+
- 4
|
56 |
+
- 4
|
57 |
+
num_res_blocks: 2
|
58 |
+
attn_resolutions: []
|
59 |
+
dropout: 0.0
|
60 |
+
lossconfig:
|
61 |
+
target: torch.nn.Identity
|
62 |
+
|
63 |
+
cond_stage_config:
|
64 |
+
target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
|
65 |
+
params:
|
66 |
+
freeze: True
|
67 |
+
layer: "penultimate"
|
models/configs/v2-inpainting-inference.yaml
ADDED
@@ -0,0 +1,158 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
model:
|
2 |
+
base_learning_rate: 5.0e-05
|
3 |
+
target: ldm.models.diffusion.ddpm.LatentInpaintDiffusion
|
4 |
+
params:
|
5 |
+
linear_start: 0.00085
|
6 |
+
linear_end: 0.0120
|
7 |
+
num_timesteps_cond: 1
|
8 |
+
log_every_t: 200
|
9 |
+
timesteps: 1000
|
10 |
+
first_stage_key: "jpg"
|
11 |
+
cond_stage_key: "txt"
|
12 |
+
image_size: 64
|
13 |
+
channels: 4
|
14 |
+
cond_stage_trainable: false
|
15 |
+
conditioning_key: hybrid
|
16 |
+
scale_factor: 0.18215
|
17 |
+
monitor: val/loss_simple_ema
|
18 |
+
finetune_keys: null
|
19 |
+
use_ema: False
|
20 |
+
|
21 |
+
unet_config:
|
22 |
+
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
|
23 |
+
params:
|
24 |
+
use_checkpoint: True
|
25 |
+
image_size: 32 # unused
|
26 |
+
in_channels: 9
|
27 |
+
out_channels: 4
|
28 |
+
model_channels: 320
|
29 |
+
attention_resolutions: [ 4, 2, 1 ]
|
30 |
+
num_res_blocks: 2
|
31 |
+
channel_mult: [ 1, 2, 4, 4 ]
|
32 |
+
num_head_channels: 64 # need to fix for flash-attn
|
33 |
+
use_spatial_transformer: True
|
34 |
+
use_linear_in_transformer: True
|
35 |
+
transformer_depth: 1
|
36 |
+
context_dim: 1024
|
37 |
+
legacy: False
|
38 |
+
|
39 |
+
first_stage_config:
|
40 |
+
target: ldm.models.autoencoder.AutoencoderKL
|
41 |
+
params:
|
42 |
+
embed_dim: 4
|
43 |
+
monitor: val/rec_loss
|
44 |
+
ddconfig:
|
45 |
+
#attn_type: "vanilla-xformers"
|
46 |
+
double_z: true
|
47 |
+
z_channels: 4
|
48 |
+
resolution: 256
|
49 |
+
in_channels: 3
|
50 |
+
out_ch: 3
|
51 |
+
ch: 128
|
52 |
+
ch_mult:
|
53 |
+
- 1
|
54 |
+
- 2
|
55 |
+
- 4
|
56 |
+
- 4
|
57 |
+
num_res_blocks: 2
|
58 |
+
attn_resolutions: [ ]
|
59 |
+
dropout: 0.0
|
60 |
+
lossconfig:
|
61 |
+
target: torch.nn.Identity
|
62 |
+
|
63 |
+
cond_stage_config:
|
64 |
+
target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
|
65 |
+
params:
|
66 |
+
freeze: True
|
67 |
+
layer: "penultimate"
|
68 |
+
|
69 |
+
|
70 |
+
data:
|
71 |
+
target: ldm.data.laion.WebDataModuleFromConfig
|
72 |
+
params:
|
73 |
+
tar_base: null # for concat as in LAION-A
|
74 |
+
p_unsafe_threshold: 0.1
|
75 |
+
filter_word_list: "data/filters.yaml"
|
76 |
+
max_pwatermark: 0.45
|
77 |
+
batch_size: 8
|
78 |
+
num_workers: 6
|
79 |
+
multinode: True
|
80 |
+
min_size: 512
|
81 |
+
train:
|
82 |
+
shards:
|
83 |
+
- "pipe:aws s3 cp s3://stability-aws/laion-a-native/part-0/{00000..18699}.tar -"
|
84 |
+
- "pipe:aws s3 cp s3://stability-aws/laion-a-native/part-1/{00000..18699}.tar -"
|
85 |
+
- "pipe:aws s3 cp s3://stability-aws/laion-a-native/part-2/{00000..18699}.tar -"
|
86 |
+
- "pipe:aws s3 cp s3://stability-aws/laion-a-native/part-3/{00000..18699}.tar -"
|
87 |
+
- "pipe:aws s3 cp s3://stability-aws/laion-a-native/part-4/{00000..18699}.tar -" #{00000-94333}.tar"
|
88 |
+
shuffle: 10000
|
89 |
+
image_key: jpg
|
90 |
+
image_transforms:
|
91 |
+
- target: torchvision.transforms.Resize
|
92 |
+
params:
|
93 |
+
size: 512
|
94 |
+
interpolation: 3
|
95 |
+
- target: torchvision.transforms.RandomCrop
|
96 |
+
params:
|
97 |
+
size: 512
|
98 |
+
postprocess:
|
99 |
+
target: ldm.data.laion.AddMask
|
100 |
+
params:
|
101 |
+
mode: "512train-large"
|
102 |
+
p_drop: 0.25
|
103 |
+
# NOTE use enough shards to avoid empty validation loops in workers
|
104 |
+
validation:
|
105 |
+
shards:
|
106 |
+
- "pipe:aws s3 cp s3://deep-floyd-s3/datasets/laion_cleaned-part5/{93001..94333}.tar - "
|
107 |
+
shuffle: 0
|
108 |
+
image_key: jpg
|
109 |
+
image_transforms:
|
110 |
+
- target: torchvision.transforms.Resize
|
111 |
+
params:
|
112 |
+
size: 512
|
113 |
+
interpolation: 3
|
114 |
+
- target: torchvision.transforms.CenterCrop
|
115 |
+
params:
|
116 |
+
size: 512
|
117 |
+
postprocess:
|
118 |
+
target: ldm.data.laion.AddMask
|
119 |
+
params:
|
120 |
+
mode: "512train-large"
|
121 |
+
p_drop: 0.25
|
122 |
+
|
123 |
+
lightning:
|
124 |
+
find_unused_parameters: True
|
125 |
+
modelcheckpoint:
|
126 |
+
params:
|
127 |
+
every_n_train_steps: 5000
|
128 |
+
|
129 |
+
callbacks:
|
130 |
+
metrics_over_trainsteps_checkpoint:
|
131 |
+
params:
|
132 |
+
every_n_train_steps: 10000
|
133 |
+
|
134 |
+
image_logger:
|
135 |
+
target: main.ImageLogger
|
136 |
+
params:
|
137 |
+
enable_autocast: False
|
138 |
+
disabled: False
|
139 |
+
batch_frequency: 1000
|
140 |
+
max_images: 4
|
141 |
+
increase_log_steps: False
|
142 |
+
log_first_step: False
|
143 |
+
log_images_kwargs:
|
144 |
+
use_ema_scope: False
|
145 |
+
inpaint: False
|
146 |
+
plot_progressive_rows: False
|
147 |
+
plot_diffusion_rows: False
|
148 |
+
N: 4
|
149 |
+
unconditional_guidance_scale: 5.0
|
150 |
+
unconditional_guidance_label: [""]
|
151 |
+
ddim_steps: 50 # todo check these out for depth2img,
|
152 |
+
ddim_eta: 0.0 # todo check these out for depth2img,
|
153 |
+
|
154 |
+
trainer:
|
155 |
+
benchmark: True
|
156 |
+
val_check_interval: 5000000
|
157 |
+
num_sanity_val_steps: 0
|
158 |
+
accumulate_grad_batches: 1
|
models/controlnet/put_controlnets_and_t2i_here
ADDED
File without changes
|
models/diffusers/put_diffusers_models_here
ADDED
File without changes
|
models/embeddings/put_embeddings_or_textual_inversion_concepts_here
ADDED
File without changes
|
models/gligen/put_gligen_models_here
ADDED
File without changes
|
models/hypernetworks/put_hypernetworks_here
ADDED
File without changes
|
models/loras/put_loras_here
ADDED
File without changes
|
models/style_models/put_t2i_style_model_here
ADDED
File without changes
|
models/unet/put_unet_files_here
ADDED
File without changes
|
models/upscale_models/put_esrgan_and_other_upscale_models_here
ADDED
File without changes
|
models/vae/put_vae_here
ADDED
File without changes
|
models/vae_approx/put_taesd_encoder_pth_and_taesd_decoder_pth_here
ADDED
File without changes
|
modules/{sd.py → core.py}
RENAMED
@@ -31,13 +31,18 @@ def encode_prompt_condition(clip, prompt):
|
|
31 |
return opCLIPTextEncode.encode(clip=clip, text=prompt)[0]
|
32 |
|
33 |
|
|
|
|
|
|
|
|
|
|
|
34 |
@torch.no_grad()
|
35 |
def decode_vae(vae, latent_image):
|
36 |
return opVAEDecode.decode(samples=latent_image, vae=vae)[0]
|
37 |
|
38 |
|
39 |
@torch.no_grad()
|
40 |
-
def ksample(
|
41 |
sampler_name='euler_ancestral', scheduler='normal', start_at_step=None, end_at_step=None,
|
42 |
return_with_leftover_noise=False):
|
43 |
return opKSamplerAdvanced.sample(
|
@@ -50,7 +55,7 @@ def ksample(model, positive_condition, negative_condition, latent_image, add_noi
|
|
50 |
start_at_step=0 if start_at_step is None else start_at_step,
|
51 |
end_at_step=steps if end_at_step is None else end_at_step,
|
52 |
return_with_leftover_noise='enable' if return_with_leftover_noise else 'disable',
|
53 |
-
model=
|
54 |
positive=positive_condition,
|
55 |
negative=negative_condition,
|
56 |
latent_image=latent_image,
|
|
|
31 |
return opCLIPTextEncode.encode(clip=clip, text=prompt)[0]
|
32 |
|
33 |
|
34 |
+
@torch.no_grad()
|
35 |
+
def generate_empty_latent(width=1024, height=1024, batch_size=1):
|
36 |
+
return opEmptyLatentImage.generate(width=width, height=height, batch_size=batch_size)[0]
|
37 |
+
|
38 |
+
|
39 |
@torch.no_grad()
|
40 |
def decode_vae(vae, latent_image):
|
41 |
return opVAEDecode.decode(samples=latent_image, vae=vae)[0]
|
42 |
|
43 |
|
44 |
@torch.no_grad()
|
45 |
+
def ksample(unet, positive_condition, negative_condition, latent_image, add_noise=True, noise_seed=None, steps=25, cfg=9,
|
46 |
sampler_name='euler_ancestral', scheduler='normal', start_at_step=None, end_at_step=None,
|
47 |
return_with_leftover_noise=False):
|
48 |
return opKSamplerAdvanced.sample(
|
|
|
55 |
start_at_step=0 if start_at_step is None else start_at_step,
|
56 |
end_at_step=steps if end_at_step is None else end_at_step,
|
57 |
return_with_leftover_noise='enable' if return_with_leftover_noise else 'disable',
|
58 |
+
model=unet,
|
59 |
positive=positive_condition,
|
60 |
negative=negative_condition,
|
61 |
latent_image=latent_image,
|
webui.py
CHANGED
@@ -2,49 +2,32 @@ import os
|
|
2 |
import random
|
3 |
import torch
|
4 |
import numpy as np
|
|
|
5 |
|
6 |
-
from comfy.sd import load_checkpoint_guess_config
|
7 |
-
from nodes import VAEDecode, KSamplerAdvanced, EmptyLatentImage, SaveImage, CLIPTextEncode
|
8 |
from modules.path import modelfile_path
|
9 |
|
10 |
|
11 |
xl_base_filename = os.path.join(modelfile_path, 'sd_xl_base_1.0.safetensors')
|
12 |
xl_refiner_filename = os.path.join(modelfile_path, 'sd_xl_refiner_1.0.safetensors')
|
13 |
|
14 |
-
xl_base
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
positive_conditions
|
24 |
-
negative_conditions
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
start_at_step=0,
|
36 |
-
end_at_step=25,
|
37 |
-
return_with_leftover_noise="enable",
|
38 |
-
model=xl_base,
|
39 |
-
positive=positive_conditions,
|
40 |
-
negative=negative_conditions,
|
41 |
-
latent_image=initial_latent_image,
|
42 |
-
)[0]
|
43 |
-
|
44 |
-
vae_decoded = opVAEDecode.decode(samples=samples, vae=xl_base_vae)[0]
|
45 |
-
|
46 |
-
for image in vae_decoded:
|
47 |
-
i = 255. * image.cpu().numpy()
|
48 |
-
img = np.clip(i, 0, 255).astype(np.uint8)
|
49 |
-
import cv2
|
50 |
-
cv2.imwrite('a.png', img[:, :, ::-1])
|
|
|
2 |
import random
|
3 |
import torch
|
4 |
import numpy as np
|
5 |
+
import modules.core as core
|
6 |
|
|
|
|
|
7 |
from modules.path import modelfile_path
|
8 |
|
9 |
|
10 |
xl_base_filename = os.path.join(modelfile_path, 'sd_xl_base_1.0.safetensors')
|
11 |
xl_refiner_filename = os.path.join(modelfile_path, 'sd_xl_refiner_1.0.safetensors')
|
12 |
|
13 |
+
xl_base = core.load_model(xl_base_filename)
|
14 |
+
|
15 |
+
positive_conditions = core.encode_prompt_condition(clip=xl_base.clip, prompt='a handsome man in forest')
|
16 |
+
negative_conditions = core.encode_prompt_condition(clip=xl_base.clip, prompt='bad, ugly')
|
17 |
+
|
18 |
+
empty_latent = core.generate_empty_latent(width=1024, height=1024, batch_size=1)
|
19 |
+
|
20 |
+
sampled_latent = core.ksample(
|
21 |
+
unet=xl_base.unet,
|
22 |
+
positive_condition=positive_conditions,
|
23 |
+
negative_condition=negative_conditions,
|
24 |
+
latent_image=empty_latent
|
25 |
+
)
|
26 |
+
|
27 |
+
decoded_latent = core.decode_vae(vae=xl_base.vae, latent_image=sampled_latent)
|
28 |
+
|
29 |
+
images = core.image_to_numpy(decoded_latent)
|
30 |
+
|
31 |
+
for image in images:
|
32 |
+
import cv2
|
33 |
+
cv2.imwrite('a.png', image[:, :, ::-1])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|