|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import gc |
|
import time |
|
import unittest |
|
|
|
import numpy as np |
|
import torch |
|
from huggingface_hub import hf_hub_download |
|
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer |
|
|
|
from diffusers import ( |
|
AutoencoderKL, |
|
DDIMScheduler, |
|
DPMSolverMultistepScheduler, |
|
EulerDiscreteScheduler, |
|
StableDiffusionPipeline, |
|
UNet2DConditionModel, |
|
) |
|
from diffusers.utils.testing_utils import ( |
|
enable_full_determinism, |
|
load_numpy, |
|
numpy_cosine_similarity_distance, |
|
require_torch_gpu, |
|
slow, |
|
torch_device, |
|
) |
|
|
|
|
|
# Called once at import time, before either test class below is defined.
# NOTE(review): presumably switches torch/CUDA into deterministic mode so that the
# hard-coded expected slices in these tests stay reproducible — confirm against
# diffusers.utils.testing_utils.
enable_full_determinism()
|
|
|
|
|
class StableDiffusion2VPredictionPipelineFastTests(unittest.TestCase):
    """Fast v-prediction tests that run ``StableDiffusionPipeline`` on small dummy components.

    The UNet, VAE and text encoder are built in-process right after ``torch.manual_seed(0)``;
    only the tokenizer is loaded with ``from_pretrained``.
    """

    def setUp(self):
        # Start every test from a clean slate: collect garbage, then empty the CUDA cache.
        super().setUp()
        gc.collect()
        torch.cuda.empty_cache()

    def tearDown(self):
        # Same cleanup as setUp so one test's allocations do not linger into the next.
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

    @property
    def dummy_cond_unet(self):
        """Return a freshly built, seeded two-block conditional UNet."""
        torch.manual_seed(0)
        return UNet2DConditionModel(
            block_out_channels=(32, 64),
            layers_per_block=2,
            sample_size=32,
            in_channels=4,
            out_channels=4,
            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
            cross_attention_dim=32,
            attention_head_dim=(2, 4),
            use_linear_projection=True,
        )

    @property
    def dummy_vae(self):
        """Return a freshly built, seeded two-block ``AutoencoderKL`` with 4 latent channels."""
        torch.manual_seed(0)
        return AutoencoderKL(
            block_out_channels=[32, 64],
            in_channels=3,
            out_channels=3,
            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
            latent_channels=4,
            sample_size=128,
        )

    @property
    def dummy_text_encoder(self):
        """Return a freshly built, seeded ``CLIPTextModel`` with hidden size 32."""
        torch.manual_seed(0)
        config = CLIPTextConfig(
            bos_token_id=0,
            eos_token_id=2,
            hidden_size=32,
            intermediate_size=37,
            layer_norm_eps=1e-05,
            num_attention_heads=4,
            num_hidden_layers=5,
            pad_token_id=1,
            vocab_size=1000,
            hidden_act="gelu",
            projection_dim=64,
        )
        return CLIPTextModel(config)

    @staticmethod
    def _v_prediction_ddim_scheduler():
        """Return the DDIM v-prediction scheduler configuration shared by the DDIM and fp16 tests."""
        return DDIMScheduler(
            beta_start=0.00085,
            beta_end=0.012,
            beta_schedule="scaled_linear",
            clip_sample=False,
            set_alpha_to_one=False,
            prediction_type="v_prediction",
        )

    def _build_pipeline(self, scheduler, device, half=False):
        """Assemble a pipeline from the dummy components and move it to ``device``.

        Args:
            scheduler: Scheduler instance to plug into the pipeline.
            device: Target passed to ``pipeline.to(...)``.
            half: When true, ``.half()`` is applied to the UNet, VAE and text encoder
                before the pipeline is constructed.

        Returns:
            The ``StableDiffusionPipeline`` with its progress bar configured via
            ``set_progress_bar_config(disable=None)``.
        """
        # Component order (unet -> vae -> text encoder) matches the original tests; each
        # property reseeds the global RNG itself before building its model.
        unet = self.dummy_cond_unet
        vae = self.dummy_vae
        text_encoder = self.dummy_text_encoder
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        if half:
            unet = unet.half()
            vae = vae.half()
            text_encoder = text_encoder.half()

        sd_pipe = StableDiffusionPipeline(
            unet=unet,
            scheduler=scheduler,
            vae=vae,
            text_encoder=text_encoder,
            tokenizer=tokenizer,
            safety_checker=None,
            feature_extractor=None,
            image_encoder=None,
            requires_safety_checker=False,
        )
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)
        return sd_pipe

    def _check_dict_and_tuple_outputs(self, sd_pipe, device, expected_slice):
        """Run the pipeline twice with the same seed and compare both results to ``expected_slice``.

        The first call uses the default return value and reads ``.images``; the second passes
        ``return_dict=False`` and takes element ``[0]``. Both must match the reference values
        within ``1e-2``.
        """
        prompt = "A painting of a squirrel eating a burger"

        generator = torch.Generator(device=device).manual_seed(0)
        output = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np")
        image = output.images

        # Re-seed so the second run starts from the same generator state as the first.
        generator = torch.Generator(device=device).manual_seed(0)
        image_from_tuple = sd_pipe(
            [prompt],
            generator=generator,
            guidance_scale=6.0,
            num_inference_steps=2,
            output_type="np",
            return_dict=False,
        )[0]

        # 3x3 corner of the last channel of the first image.
        image_slice = image[0, -3:, -3:, -1]
        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)
        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_v_pred_ddim(self):
        """DDIM with v-prediction on CPU matches the stored reference slice."""
        device = "cpu"
        sd_pipe = self._build_pipeline(self._v_prediction_ddim_scheduler(), device)

        expected_slice = np.array([0.6569, 0.6525, 0.5142, 0.4968, 0.4923, 0.4601, 0.4996, 0.5041, 0.4544])
        self._check_dict_and_tuple_outputs(sd_pipe, device, expected_slice)

    def test_stable_diffusion_v_pred_k_euler(self):
        """Euler discrete scheduler with v-prediction on CPU matches the stored reference slice."""
        device = "cpu"
        scheduler = EulerDiscreteScheduler(
            beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", prediction_type="v_prediction"
        )
        sd_pipe = self._build_pipeline(scheduler, device)

        expected_slice = np.array([0.5644, 0.6514, 0.5190, 0.5663, 0.5287, 0.4953, 0.5430, 0.5243, 0.4778])
        self._check_dict_and_tuple_outputs(sd_pipe, device, expected_slice)

    @unittest.skipIf(torch_device != "cuda", "This test requires a GPU")
    def test_stable_diffusion_v_pred_fp16(self):
        """Test that stable diffusion v-prediction works with fp16"""
        sd_pipe = self._build_pipeline(self._v_prediction_ddim_scheduler(), torch_device, half=True)

        prompt = "A painting of a squirrel eating a burger"
        generator = torch.manual_seed(0)
        image = sd_pipe([prompt], generator=generator, num_inference_steps=2, output_type="np").images

        # Only the output shape is checked here; no reference values are compared.
        assert image.shape == (1, 64, 64, 3)
|
|
|
|
|
@slow
@require_torch_gpu
class StableDiffusion2VPredictionPipelineIntegrationTests(unittest.TestCase):
    """Integration tests that load the ``stabilityai/stable-diffusion-2`` / ``-2-1`` checkpoints.

    Each test compares generated output against stored reference values, or checks peak CUDA
    memory / load time. The class is gated by the ``slow`` and ``require_torch_gpu`` decorators.
    """

    def setUp(self):
        # Start every test from a clean slate: collect garbage, then empty the CUDA cache.
        super().setUp()
        gc.collect()
        torch.cuda.empty_cache()

    def tearDown(self):
        # Same cleanup as setUp so one test's allocations do not linger into the next.
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

    def test_stable_diffusion_v_pred_default(self):
        """Checkpoint with its default scheduler, 20 steps: compare a 3x3 slice to reference values."""
        sd_pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2")
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.enable_attention_slicing()
        sd_pipe.set_progress_bar_config(disable=None)

        prompt = "A painting of a squirrel eating a burger"
        generator = torch.manual_seed(0)
        output = sd_pipe([prompt], generator=generator, guidance_scale=7.5, num_inference_steps=20, output_type="np")

        image = output.images
        image_slice = image[0, 253:256, 253:256, -1]

        assert image.shape == (1, 768, 768, 3)
        expected_slice = np.array([0.1868, 0.1922, 0.1527, 0.1921, 0.1908, 0.1624, 0.1779, 0.1652, 0.1734])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_v_pred_upcast_attention(self):
        """SD 2.1 loaded in float16, 20 steps: compare a 3x3 slice with a looser 5e-2 tolerance."""
        sd_pipe = StableDiffusionPipeline.from_pretrained(
            "stabilityai/stable-diffusion-2-1", torch_dtype=torch.float16
        )
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.enable_attention_slicing()
        sd_pipe.set_progress_bar_config(disable=None)

        prompt = "A painting of a squirrel eating a burger"
        generator = torch.manual_seed(0)
        output = sd_pipe([prompt], generator=generator, guidance_scale=7.5, num_inference_steps=20, output_type="np")

        image = output.images
        image_slice = image[0, 253:256, 253:256, -1]

        assert image.shape == (1, 768, 768, 3)
        expected_slice = np.array([0.4209, 0.4087, 0.4097, 0.4209, 0.3860, 0.4329, 0.4280, 0.4324, 0.4187])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 5e-2

    def test_stable_diffusion_v_pred_euler(self):
        """Swap in ``EulerDiscreteScheduler`` (5 steps) and compare a 3x3 slice to reference values."""
        scheduler = EulerDiscreteScheduler.from_pretrained("stabilityai/stable-diffusion-2", subfolder="scheduler")
        sd_pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2", scheduler=scheduler)
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.enable_attention_slicing()
        sd_pipe.set_progress_bar_config(disable=None)

        prompt = "A painting of a squirrel eating a burger"
        generator = torch.manual_seed(0)

        output = sd_pipe([prompt], generator=generator, num_inference_steps=5, output_type="np")
        image = output.images

        image_slice = image[0, 253:256, 253:256, -1]

        assert image.shape == (1, 768, 768, 3)
        expected_slice = np.array([0.1781, 0.1695, 0.1661, 0.1705, 0.1588, 0.1699, 0.2005, 0.1589, 0.1677])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_v_pred_dpm(self):
        """
        TODO: update this test after making DPM compatible with V-prediction!
        """
        scheduler = DPMSolverMultistepScheduler.from_pretrained(
            "stabilityai/stable-diffusion-2",
            subfolder="scheduler",
            final_sigmas_type="sigma_min",
        )
        sd_pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2", scheduler=scheduler)
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.enable_attention_slicing()
        sd_pipe.set_progress_bar_config(disable=None)

        prompt = "a photograph of an astronaut riding a horse"
        generator = torch.manual_seed(0)
        image = sd_pipe(
            [prompt], generator=generator, guidance_scale=7.5, num_inference_steps=5, output_type="np"
        ).images

        image_slice = image[0, 253:256, 253:256, -1]
        assert image.shape == (1, 768, 768, 3)
        expected_slice = np.array([0.3303, 0.3184, 0.3291, 0.3300, 0.3256, 0.3113, 0.2965, 0.3134, 0.3192])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_attention_slicing_v_pred(self):
        """Compare peak CUDA memory and output with attention slicing enabled versus disabled."""
        torch.cuda.reset_peak_memory_stats()
        model_id = "stabilityai/stable-diffusion-2"
        pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16)
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)

        prompt = "a photograph of an astronaut riding a horse"

        # First run: attention slicing enabled; peak memory must stay below the 5.5e9-byte bound.
        pipe.enable_attention_slicing()
        generator = torch.manual_seed(0)
        output_chunked = pipe(
            [prompt], generator=generator, guidance_scale=7.5, num_inference_steps=10, output_type="np"
        )
        image_chunked = output_chunked.images

        mem_bytes = torch.cuda.max_memory_allocated()
        # Reset the peak counter so the second run is measured on its own.
        torch.cuda.reset_peak_memory_stats()

        assert mem_bytes < 5.5 * 10**9

        # Second run: attention slicing disabled; peak memory must exceed the 3e9-byte bound.
        pipe.disable_attention_slicing()
        generator = torch.manual_seed(0)
        output = pipe([prompt], generator=generator, guidance_scale=7.5, num_inference_steps=10, output_type="np")
        image = output.images

        mem_bytes = torch.cuda.max_memory_allocated()
        assert mem_bytes > 3 * 10**9
        # Both runs must produce nearly the same image.
        max_diff = numpy_cosine_similarity_distance(image.flatten(), image_chunked.flatten())
        assert max_diff < 1e-3

    def test_stable_diffusion_text2img_pipeline_v_pred_default(self):
        """Full-image comparison against the stored ``astronaut_riding_a_horse_v_pred.npy`` reference."""
        expected_image = load_numpy(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/"
            "sd2-text2img/astronaut_riding_a_horse_v_pred.npy"
        )

        pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2")
        pipe.to(torch_device)
        pipe.enable_attention_slicing()
        pipe.set_progress_bar_config(disable=None)

        prompt = "astronaut riding a horse"

        generator = torch.manual_seed(0)
        output = pipe(prompt=prompt, guidance_scale=7.5, generator=generator, output_type="np")
        image = output.images[0]

        assert image.shape == (768, 768, 3)
        max_diff = numpy_cosine_similarity_distance(image.flatten(), expected_image.flatten())
        assert max_diff < 1e-3

    def test_stable_diffusion_text2img_pipeline_unflawed(self):
        """SD 2.1 with a reconfigured DDIM scheduler and ``guidance_rescale``, compared to ``lion_galaxy.npy``."""
        expected_image = load_numpy(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/"
            "sd2-text2img/lion_galaxy.npy"
        )

        pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1")
        # Rebuild the scheduler from the checkpoint's config with trailing timestep spacing
        # and zero-terminal-SNR beta rescaling switched on.
        pipe.scheduler = DDIMScheduler.from_config(
            pipe.scheduler.config, timestep_spacing="trailing", rescale_betas_zero_snr=True
        )
        pipe.enable_model_cpu_offload()
        pipe.set_progress_bar_config(disable=None)

        prompt = "A lion in galaxies, spirals, nebulae, stars, smoke, iridescent, intricate detail, octane render, 8k"

        generator = torch.Generator("cpu").manual_seed(0)
        output = pipe(
            prompt=prompt,
            guidance_scale=7.5,
            num_inference_steps=10,
            guidance_rescale=0.7,
            generator=generator,
            output_type="np",
        )
        image = output.images[0]

        assert image.shape == (768, 768, 3)
        max_diff = numpy_cosine_similarity_distance(image.flatten(), expected_image.flatten())
        assert max_diff < 5e-2

    def test_stable_diffusion_text2img_pipeline_v_pred_fp16(self):
        """float16 full-image comparison against ``astronaut_riding_a_horse_v_pred_fp16.npy``."""
        expected_image = load_numpy(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/"
            "sd2-text2img/astronaut_riding_a_horse_v_pred_fp16.npy"
        )

        pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2", torch_dtype=torch.float16)
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)

        prompt = "astronaut riding a horse"

        generator = torch.manual_seed(0)
        output = pipe(prompt=prompt, guidance_scale=7.5, generator=generator, output_type="np")
        image = output.images[0]

        assert image.shape == (768, 768, 3)
        max_diff = numpy_cosine_similarity_distance(image.flatten(), expected_image.flatten())
        assert max_diff < 1e-3

    def test_download_local(self):
        """Load the pipeline from a single downloaded ``.safetensors`` file and run one step."""
        filename = hf_hub_download("stabilityai/stable-diffusion-2-1", filename="v2-1_768-ema-pruned.safetensors")

        pipe = StableDiffusionPipeline.from_single_file(filename, torch_dtype=torch.float16)
        pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
        pipe.enable_model_cpu_offload()

        image_out = pipe("test", num_inference_steps=1, output_type="np").images[0]

        # Only the output shape is checked; no reference values are compared.
        assert image_out.shape == (768, 768, 3)

    def test_stable_diffusion_text2img_intermediate_state_v_pred(self):
        """Check the latents handed to the step callback on steps 0 and 19, and the call count."""
        number_of_steps = 0

        def test_callback_fn(step: int, timestep: int, latents: torch.Tensor) -> None:
            # The function attribute records that the pipeline invoked the callback at all.
            test_callback_fn.has_been_called = True
            nonlocal number_of_steps
            number_of_steps += 1
            if step == 0:
                latents = latents.detach().cpu().numpy()
                assert latents.shape == (1, 4, 96, 96)
                latents_slice = latents[0, -3:, -3:, -1]
                expected_slice = np.array([0.7749, 0.0325, 0.5088, 0.1619, 0.3372, 0.3667, -0.5186, 0.6860, 1.4326])

                assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2
            elif step == 19:
                latents = latents.detach().cpu().numpy()
                assert latents.shape == (1, 4, 96, 96)
                latents_slice = latents[0, -3:, -3:, -1]
                expected_slice = np.array([1.3887, 1.0273, 1.7266, 0.0726, 0.6611, 0.1598, -1.0547, 0.1522, 0.0227])

                assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2

        test_callback_fn.has_been_called = False

        pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2", torch_dtype=torch.float16)
        pipe = pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        prompt = "Andromeda galaxy in a bottle"

        generator = torch.manual_seed(0)
        pipe(
            prompt=prompt,
            num_inference_steps=20,
            guidance_scale=7.5,
            generator=generator,
            callback=test_callback_fn,
            callback_steps=1,
        )
        assert test_callback_fn.has_been_called
        # callback_steps=1 with 20 inference steps: one callback invocation per step is expected.
        assert number_of_steps == 20

    def test_stable_diffusion_low_cpu_mem_usage_v_pred(self):
        """The default load must take less than half the time of a ``low_cpu_mem_usage=False`` load."""
        pipeline_id = "stabilityai/stable-diffusion-2"

        # perf_counter() is monotonic, so the measured interval cannot be distorted by
        # wall-clock adjustments the way time.time() differences can.
        start_time = time.perf_counter()
        pipeline_low_cpu_mem_usage = StableDiffusionPipeline.from_pretrained(pipeline_id, torch_dtype=torch.float16)
        pipeline_low_cpu_mem_usage.to(torch_device)
        low_cpu_mem_usage_time = time.perf_counter() - start_time

        # NOTE(review): the first measurement includes the `.to(torch_device)` transfer while
        # this one does not; kept as-is, but confirm the asymmetry is intended.
        start_time = time.perf_counter()
        _ = StableDiffusionPipeline.from_pretrained(pipeline_id, torch_dtype=torch.float16, low_cpu_mem_usage=False)
        normal_load_time = time.perf_counter() - start_time

        assert 2 * low_cpu_mem_usage_time < normal_load_time

    def test_stable_diffusion_pipeline_with_sequential_cpu_offloading_v_pred(self):
        """With sequential CPU offload and attention slicing, peak CUDA memory stays under 2.8e9 bytes."""
        torch.cuda.empty_cache()
        # reset_peak_memory_stats() alone is sufficient: the deprecated
        # reset_max_memory_allocated() merely forwards to it and emits a warning.
        torch.cuda.reset_peak_memory_stats()

        pipeline_id = "stabilityai/stable-diffusion-2"
        prompt = "Andromeda galaxy in a bottle"

        pipeline = StableDiffusionPipeline.from_pretrained(pipeline_id, torch_dtype=torch.float16)
        pipeline.enable_attention_slicing(1)
        pipeline.enable_sequential_cpu_offload()

        generator = torch.manual_seed(0)
        _ = pipeline(prompt, generator=generator, num_inference_steps=5)

        mem_bytes = torch.cuda.max_memory_allocated()

        assert mem_bytes < 2.8 * 10**9
|
|