Update pipeline.py
pipeline.py CHANGED (+277 -150)
@@ -15,10 +15,9 @@
 #
 # modified from https://github.com/AUTOMATIC1111/stable-diffusion-webui
 # Here is the AGPL-3.0 license https://github.com/AUTOMATIC1111/stable-diffusion-webui/blob/master/LICENSE.txt
-from ppdiffusers.utils import check_min_version
-check_min_version("0.14.1")
 
 import inspect
+from pathlib import Path
 from typing import Any, Callable, Dict, List, Optional, Union
 
 import paddle
@@ -39,9 +38,102 @@ from ppdiffusers.utils import (
     logging,
     randn_tensor,
     safetensors_load,
+    smart_load,
     torch_load,
 )
 
+
+@paddle.no_grad()
+def load_lora(
+    pipeline,
+    state_dict: dict,
+    LORA_PREFIX_UNET: str = "lora_unet",
+    LORA_PREFIX_TEXT_ENCODER: str = "lora_te",
+    ratio: float = 1.0,
+):
+    ratio = float(ratio)
+    visited = []
+    for key in state_dict:
+        if ".alpha" in key or ".lora_up" in key or key in visited:
+            continue
+
+        if "text" in key:
+            tmp_layer_infos = key.split(".")[0].split(LORA_PREFIX_TEXT_ENCODER + "_")[-1].split("_")
+            hf_to_ppnlp = {
+                "encoder": "transformer",
+                "fc1": "linear1",
+                "fc2": "linear2",
+            }
+            layer_infos = []
+            for layer_info in tmp_layer_infos:
+                if layer_info == "mlp":
+                    continue
+                layer_infos.append(hf_to_ppnlp.get(layer_info, layer_info))
+            curr_layer: paddle.nn.Linear = pipeline.text_encoder
+        else:
+            layer_infos = key.split(".")[0].split(LORA_PREFIX_UNET + "_")[-1].split("_")
+            curr_layer: paddle.nn.Linear = pipeline.unet
+
+        temp_name = layer_infos.pop(0)
+        while len(layer_infos) > -1:
+            try:
+                if temp_name == "to":
+                    raise ValueError()
+                curr_layer = curr_layer.__getattr__(temp_name)
+                if len(layer_infos) > 0:
+                    temp_name = layer_infos.pop(0)
+                elif len(layer_infos) == 0:
+                    break
+            except Exception:
+                if len(temp_name) > 0:
+                    temp_name += "_" + layer_infos.pop(0)
+                else:
+                    temp_name = layer_infos.pop(0)
+
+        triplet_keys = [key, key.replace("lora_down", "lora_up"), key.replace("lora_down.weight", "alpha")]
+        dtype: paddle.dtype = curr_layer.weight.dtype
+        weight_down: paddle.Tensor = state_dict[triplet_keys[0]].cast(dtype)
+        weight_up: paddle.Tensor = state_dict[triplet_keys[1]].cast(dtype)
+        rank: float = float(weight_down.shape[0])
+        if triplet_keys[2] in state_dict:
+            alpha: float = state_dict[triplet_keys[2]].cast(dtype).item()
+            scale: float = alpha / rank
+        else:
+            scale = 1.0
+
+        if not hasattr(curr_layer, "backup_weights"):
+            curr_layer.backup_weights = curr_layer.weight.clone()
+
+        if len(weight_down.shape) == 4:
+            if weight_down.shape[2:4] == [1, 1]:
+                # conv2d 1x1
+                curr_layer.weight.copy_(
+                    curr_layer.weight
+                    + ratio
+                    * paddle.matmul(weight_up.squeeze([-1, -2]), weight_down.squeeze([-1, -2])).unsqueeze([-1, -2])
+                    * scale,
+                    True,
+                )
+            else:
+                # conv2d 3x3
+                curr_layer.weight.copy_(
+                    curr_layer.weight
+                    + ratio
+                    * paddle.nn.functional.conv2d(weight_down.transpose([1, 0, 2, 3]), weight_up).transpose(
+                        [1, 0, 2, 3]
+                    )
+                    * scale,
+                    True,
+                )
+        else:
+            # linear
+            curr_layer.weight.copy_(curr_layer.weight + ratio * paddle.matmul(weight_up, weight_down).T * scale, True)
+
+        # update visited list
+        visited.extend(triplet_keys)
+    return pipeline
+
+
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 
 
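Note: the linear branch of `load_lora` implements the usual LoRA merge rule W' = W + ratio * (up @ down) * (alpha / rank). A minimal standalone sketch of that rule on a single `paddle.nn.Linear` layer follows; the shapes and values are illustrative, not taken from the commit, and paddle stores linear weights as [in_features, out_features], hence the transpose:

import paddle

# Illustrative rank-4 LoRA update for an 8 -> 8 linear layer.
rank, n_in, n_out = 4, 8, 8
layer = paddle.nn.Linear(n_in, n_out)

# torch-style LoRA factors: up is [out, rank], down is [rank, in].
weight_up = paddle.randn([n_out, rank])
weight_down = paddle.randn([rank, n_in])
alpha, ratio = 4.0, 0.8
scale = alpha / rank  # load_lora falls back to scale = 1.0 when no alpha key exists

# W' = W + ratio * (up @ down)^T * scale, transposed for paddle's weight layout.
delta = paddle.matmul(weight_up, weight_down).T * scale
layer.weight.set_value(layer.weight + ratio * delta)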
@@ -138,6 +230,7 @@ class WebUIStableDiffusionControlNetPipeline(DiffusionPipeline):
             "kdpm2-ancestral",
             "kdpm2",
         ]
+        self.weights_has_changed = False
 
     def add_ti_embedding_dir(self, embeddings_dir):
         self.sj.embedding_db.add_embedding_dir(embeddings_dir)
@@ -147,6 +240,9 @@ class WebUIStableDiffusionControlNetPipeline(DiffusionPipeline):
         self.sj.embedding_db.clear_embedding_dirs()
         self.sj.embedding_db.load_textual_inversion_embeddings(True)
 
+    def change_scheduler(self, scheduler_type="ddim"):
+        self.switch_scheduler(scheduler_type)
+
     def switch_scheduler(self, scheduler_type="ddim"):
         scheduler_type = scheduler_type.lower()
         from ppdiffusers import (
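Note: `change_scheduler` is a thin alias for the existing `switch_scheduler`, presumably added for naming consistency with the other webui pipelines. Hypothetical usage, assuming `pipe` is an instance of this pipeline and the name is in its supported list:

pipe.change_scheduler("kdpm2-ancestral")  # equivalent to pipe.switch_scheduler("kdpm2-ancestral")
pipe.switch_scheduler("DDIM")             # names are lower-cased before matching, so this is "ddim"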
@@ -409,8 +505,9 @@ class WebUIStableDiffusionControlNetPipeline(DiffusionPipeline):
         callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
         callback_steps: Optional[int] = 1,
         cross_attention_kwargs: Optional[Dict[str, Any]] = None,
-        clip_skip: int =
+        clip_skip: int = 1,
         controlnet_conditioning_scale: Union[float, List[float]] = 1.0,
+        lora_dir: str = "./loras",
     ):
         r"""
         Function invoked when calling the pipeline for generation.
@@ -468,12 +565,14 @@ class WebUIStableDiffusionControlNetPipeline(DiffusionPipeline):
                 A kwargs dictionary that if specified is passed along to the `AttnProcessor` as defined under
                 `self.processor` in
                 [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).
-            clip_skip (`int`, *optional*, defaults to
-                CLIP_stop_at_last_layers, if clip_skip
+            clip_skip (`int`, *optional*, defaults to 1):
+                CLIP_stop_at_last_layers: if clip_skip <= 1, we will use the last_hidden_state from the text_encoder.
             controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0):
                 The outputs of the controlnet are multiplied by `controlnet_conditioning_scale` before they are added
                 to the residual in the original unet. If multiple ControlNets are specified in init, you can set the
                 corresponding scale as a list.
+            lora_dir (`str`, *optional*, defaults to `"./loras"`):
+                Path to the directory holding the LoRA weights to load.
         Examples:
 
         Returns:
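Note: the docstring's Examples block is still empty. A hypothetical invocation exercising the two new arguments could look like the following; `pipe`, `control_image`, and `mylora` are placeholder names, and `<lora:name:ratio>` is the webui-style extra-network tag that `parse_prompts` strips out of the prompt in the body below:

# "mylora" must match ./loras/mylora.safetensors; 0.8 is the merge ratio.
result = pipe(
    prompt="masterpiece, a photo of a cat <lora:mylora:0.8>",
    image=control_image,   # ControlNet conditioning image
    num_inference_steps=50,
    guidance_scale=7.5,
    clip_skip=2,           # stop CLIP two layers before the last
    lora_dir="./loras",
)
image = result.images[0]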
@@ -483,172 +582,200 @@ class WebUIStableDiffusionControlNetPipeline(DiffusionPipeline):
             list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
             (nsfw) content, according to the `safety_checker`.
         """
-        batch_size = 1
-
-        image = self.prepare_image(
-            image=image,
-            width=width,
-            height=height,
-            dtype=self.controlnet.dtype,
-        )
-
-        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
-        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
-        # corresponds to doing no classifier free guidance.
-        do_classifier_free_guidance = guidance_scale > 1.0
-
-        for i, t in enumerate(timesteps):
-            step = i // self.scheduler.order
-            do_batch = False
-            conds_list, cond_tensor = reconstruct_multicond_batch(prompt_embeds, step)
-            try:
-                weight = conds_list[0][0][1]
-            except Exception:
-                weight = 1.0
-            if do_classifier_free_guidance:
-                uncond_tensor = reconstruct_cond_batch(negative_prompt_embeds, step)
-                do_batch = cond_tensor.shape[1] == uncond_tensor.shape[1]
-
-            # expand the latents if we are doing classifier free guidance
-            latent_model_input = paddle.concat([latents] * 2) if do_batch else latents
-            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
-
-            if do_batch:
-                encoder_hidden_states = paddle.concat([uncond_tensor, cond_tensor])
-                down_block_res_samples, mid_block_res_sample = self.controlnet(
-                    latent_model_input,
-                    t,
-                    encoder_hidden_states=encoder_hidden_states,
-                    controlnet_cond=paddle.concat([image, image]),
-                    conditioning_scale=controlnet_conditioning_scale,
-                    return_dict=False,
-                )
-                noise_pred = self.unet(
-                    latent_model_input,
-                    t,
-                    encoder_hidden_states=encoder_hidden_states,
-                    cross_attention_kwargs=cross_attention_kwargs,
-                    down_block_additional_residuals=down_block_res_samples,
-                    mid_block_additional_residual=mid_block_res_sample,
-                ).sample
-                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
-                noise_pred = noise_pred_uncond + weight * guidance_scale * (noise_pred_text - noise_pred_uncond)
-            else:
-                down_block_res_samples, mid_block_res_sample = self.controlnet(
-                    latent_model_input,
-                    t,
-                    encoder_hidden_states=cond_tensor,
-                    controlnet_cond=image,
-                    conditioning_scale=controlnet_conditioning_scale,
-                    return_dict=False,
-                )
-                noise_pred = self.unet(
-                    latent_model_input,
-                    t,
-                    encoder_hidden_states=cond_tensor,
-                    cross_attention_kwargs=cross_attention_kwargs,
-                    down_block_additional_residuals=down_block_res_samples,
-                    mid_block_additional_residual=mid_block_res_sample,
-                ).sample
-
-                if do_classifier_free_guidance:
-                    down_block_res_samples, mid_block_res_sample = self.controlnet(
-                        latent_model_input,
-                        t,
-                        encoder_hidden_states=uncond_tensor,
-                        controlnet_cond=image,
-                        conditioning_scale=controlnet_conditioning_scale,
-                        return_dict=False,
-                    )
-                    noise_pred_uncond = self.unet(
-                        latent_model_input,
-                        t,
-                        encoder_hidden_states=uncond_tensor,
-                        cross_attention_kwargs=cross_attention_kwargs,
-                        down_block_additional_residuals=down_block_res_samples,
-                        mid_block_additional_residual=mid_block_res_sample,
-                    ).sample
-                    noise_pred = noise_pred_uncond + weight * guidance_scale * (noise_pred - noise_pred_uncond)
-
-            # compute the previous noisy sample x_t -> x_t-1
-            latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+        try:
+            # 0. Default height and width to unet
+            height, width = self._default_height_width(height, width, image)
+
+            # 1. Check inputs. Raise error if not correct
+            self.check_inputs(
+                prompt,
+                image,
+                height,
+                width,
+                callback_steps,
+                negative_prompt,
+                controlnet_conditioning_scale,
+            )
+
+            batch_size = 1
+
+            image = self.prepare_image(
+                image=image,
+                width=width,
+                height=height,
+                dtype=self.controlnet.dtype,
+            )
+
+            # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+            # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+            # corresponds to doing no classifier free guidance.
+            do_classifier_free_guidance = guidance_scale > 1.0
+
+            prompts, extra_network_data = parse_prompts([prompt])
+
+            if lora_dir is not None and os.path.exists(lora_dir):
+                lora_mapping = {p.stem: p.absolute() for p in Path(lora_dir).glob("*.safetensors")}
+                for params in extra_network_data["lora"]:
+                    assert len(params.items) > 0
+                    name = params.items[0]
+                    if name in lora_mapping:
+                        ratio = float(params.items[1]) if len(params.items) > 1 else 1.0
+                        lora_state_dict = smart_load(lora_mapping[name], map_location=paddle.get_device())
+                        self.weights_has_changed = True
+                        load_lora(self, state_dict=lora_state_dict, ratio=ratio)
+                        del lora_state_dict
+                    else:
+                        print(f"We can't find lora weight: {name}! Please make sure it exists!")
+
+            self.sj.clip.CLIP_stop_at_last_layers = clip_skip
+            # 3. Encode input prompt
+            prompt_embeds, negative_prompt_embeds = self._encode_prompt(
+                prompts,
+                do_classifier_free_guidance,
+                negative_prompt,
+                num_inference_steps=num_inference_steps,
+            )
+
+            # 4. Prepare timesteps
+            self.scheduler.set_timesteps(num_inference_steps)
+            timesteps = self.scheduler.timesteps
+
+            # 5. Prepare latent variables
+            num_channels_latents = self.unet.in_channels
+            latents = self.prepare_latents(
+                batch_size,
+                num_channels_latents,
+                height,
+                width,
+                self.unet.dtype,
+                generator,
+                latents,
+            )
+
+            # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+            extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+            # 7. Denoising loop
+            num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+            with self.progress_bar(total=num_inference_steps) as progress_bar:
+                for i, t in enumerate(timesteps):
+                    step = i // self.scheduler.order
+                    do_batch = False
+                    conds_list, cond_tensor = reconstruct_multicond_batch(prompt_embeds, step)
+                    try:
+                        weight = conds_list[0][0][1]
+                    except Exception:
+                        weight = 1.0
+                    if do_classifier_free_guidance:
+                        uncond_tensor = reconstruct_cond_batch(negative_prompt_embeds, step)
+                        do_batch = cond_tensor.shape[1] == uncond_tensor.shape[1]
+
+                    # expand the latents if we are doing classifier free guidance
+                    latent_model_input = paddle.concat([latents] * 2) if do_batch else latents
+                    latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                    if do_batch:
+                        encoder_hidden_states = paddle.concat([uncond_tensor, cond_tensor])
+                        down_block_res_samples, mid_block_res_sample = self.controlnet(
+                            latent_model_input,
+                            t,
+                            encoder_hidden_states=encoder_hidden_states,
+                            controlnet_cond=paddle.concat([image, image]),
+                            conditioning_scale=controlnet_conditioning_scale,
+                            return_dict=False,
+                        )
+                        noise_pred = self.unet(
+                            latent_model_input,
+                            t,
+                            encoder_hidden_states=encoder_hidden_states,
+                            cross_attention_kwargs=cross_attention_kwargs,
+                            down_block_additional_residuals=down_block_res_samples,
+                            mid_block_additional_residual=mid_block_res_sample,
+                        ).sample
+                        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                        noise_pred = noise_pred_uncond + weight * guidance_scale * (
+                            noise_pred_text - noise_pred_uncond
+                        )
+                    else:
+                        down_block_res_samples, mid_block_res_sample = self.controlnet(
+                            latent_model_input,
+                            t,
+                            encoder_hidden_states=cond_tensor,
+                            controlnet_cond=image,
+                            conditioning_scale=controlnet_conditioning_scale,
+                            return_dict=False,
+                        )
+                        noise_pred = self.unet(
+                            latent_model_input,
+                            t,
+                            encoder_hidden_states=cond_tensor,
+                            cross_attention_kwargs=cross_attention_kwargs,
+                            down_block_additional_residuals=down_block_res_samples,
+                            mid_block_additional_residual=mid_block_res_sample,
+                        ).sample
+
+                        if do_classifier_free_guidance:
+                            down_block_res_samples, mid_block_res_sample = self.controlnet(
+                                latent_model_input,
+                                t,
+                                encoder_hidden_states=uncond_tensor,
+                                controlnet_cond=image,
+                                conditioning_scale=controlnet_conditioning_scale,
+                                return_dict=False,
+                            )
+                            noise_pred_uncond = self.unet(
+                                latent_model_input,
+                                t,
+                                encoder_hidden_states=uncond_tensor,
+                                cross_attention_kwargs=cross_attention_kwargs,
+                                down_block_additional_residuals=down_block_res_samples,
+                                mid_block_additional_residual=mid_block_res_sample,
+                            ).sample
+                            noise_pred = noise_pred_uncond + weight * guidance_scale * (noise_pred - noise_pred_uncond)
+
+                    # compute the previous noisy sample x_t -> x_t-1
+                    latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+                    # call the callback, if provided
+                    if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                        progress_bar.update()
+                        if callback is not None and i % callback_steps == 0:
+                            callback(i, t, latents)
+
+            if output_type == "latent":
+                image = latents
+                has_nsfw_concept = None
+            elif output_type == "pil":
+                # 8. Post-processing
+                image = self.decode_latents(latents)
+
+                # 9. Run safety checker
+                image, has_nsfw_concept = self.run_safety_checker(image, self.unet.dtype)
+
+                # 10. Convert to PIL
+                image = self.numpy_to_pil(image)
+            else:
+                # 8. Post-processing
+                image = self.decode_latents(latents)
+
+                # 9. Run safety checker
+                image, has_nsfw_concept = self.run_safety_checker(image, self.unet.dtype)
+
+            if not return_dict:
+                return (image, has_nsfw_concept)
+
+            return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
+        except Exception as e:
+            raise ValueError(e)
+        finally:
+            if self.weights_has_changed:
+                for sub_layer in self.text_encoder.sublayers(include_self=True):
+                    if hasattr(sub_layer, "backup_weights"):
+                        sub_layer.weight.copy_(sub_layer.backup_weights, True)
+                for sub_layer in self.unet.sublayers(include_self=True):
+                    if hasattr(sub_layer, "backup_weights"):
+                        sub_layer.weight.copy_(sub_layer.backup_weights, True)
+                self.weights_has_changed = False
 
 
 # clip.py
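Note: in both branches of the denoising loop above, the combination step is classifier-free guidance with an extra per-prompt weight taken from the multicond parser: noise_pred = uncond + weight * guidance_scale * (text - uncond). A toy numeric check (all values made up):

import paddle

# Stand-ins for the UNet outputs; shapes and values are illustrative only.
noise_pred_uncond = paddle.to_tensor([0.10, -0.20])
noise_pred_text = paddle.to_tensor([0.30, 0.10])
weight, guidance_scale = 1.0, 7.5

noise_pred = noise_pred_uncond + weight * guidance_scale * (noise_pred_text - noise_pred_uncond)
# -> [0.10 + 7.5 * 0.20, -0.20 + 7.5 * 0.30] = [1.60, 2.05]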
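Note: the `finally` block relies on the `backup_weights` attribute that `load_lora` attaches before its first in-place merge, so the base weights are restored even if generation raises. A minimal standalone sketch of that backup/restore pattern on one layer (an illustration under those assumptions, not the pipeline code):

import paddle

layer = paddle.nn.Linear(4, 4)

# Stash a pristine copy once before mutating (mirrors load_lora).
if not hasattr(layer, "backup_weights"):
    layer.backup_weights = layer.weight.clone()

# ... merge LoRA deltas into layer.weight in place ...
layer.weight.set_value(layer.weight + 0.1)

# Restore afterwards (mirrors the finally block).
if hasattr(layer, "backup_weights"):
    layer.weight.set_value(layer.backup_weights)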