If I don't use enable_model_cpu_offload(), how much memory does it take to run everything on the GPU?
Similar, about 12.6 GB. You should install diffusers from this commit: 3bdf25a.
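For reference, a peak-memory figure like that can be read from torch's CUDA statistics (a minimal sketch, not part of the original answer; pipe and prompt stand for an already loaded pipeline and a prompt):

import torch

torch.cuda.reset_peak_memory_stats()
image = pipe(prompt).images[0]  # run the pipeline once
print(f"Peak GPU memory: {torch.cuda.max_memory_allocated() / 1024**3:.3f} GB")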
I tried it and got this error:

ValueError: .to is not supported for 4-bit or 8-bit bitsandbytes models. Please use the model as it is, since the model has already been set to the correct devices and casted to the correct dtype.

This is the code that triggers it:
from transformers import T5EncoderModel
from diffusers import FluxTransformer2DModel, FluxPipeline
import torch
import gc

assert torch.cuda.is_available(), "CUDA is not available. Please check your GPU and CUDA installation."
device = torch.device("cuda:0")

def flush():
    """Frees Python garbage and resets CUDA memory statistics."""
    gc.collect()
    torch.cuda.empty_cache()
    torch.cuda.reset_max_memory_allocated()
    torch.cuda.reset_peak_memory_stats()

def bytes_to_giga_bytes(bytes):
    return f"{(bytes / 1024 / 1024 / 1024):.3f}"

flush()

nf4_model_id = "sayakpaul/flux.1-dev-nf4-pkg"
# These two components are NF4-quantized with bitsandbytes; calling
# .to(device) on them is what raises the ValueError above.
text_encoder_2 = T5EncoderModel.from_pretrained(
    nf4_model_id, subfolder="text_encoder_2", torch_dtype=torch.float16
).to(device)
transformer = FluxTransformer2DModel.from_pretrained(
    nf4_model_id, subfolder="transformer", torch_dtype=torch.float16
).to(device)
pipe = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    text_encoder_2=text_encoder_2,
    transformer=transformer,
    torch_dtype=torch.float16
).to(device)

prompt = "A mystic cat with a sign that says hello world!"
generator = torch.Generator(device=device).manual_seed(0)
image = pipe(
    prompt,
    guidance_scale=3.5,
    num_inference_steps=50,
    generator=generator
).images[0]
image
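As the ValueError says, the quantized components already sit on a device after loading, so the fix for this snippet is to drop the two .to(device) calls on them. Their placement can still be verified without moving them (a sketch, not from the thread):

# Works for any torch.nn.Module: check where its parameters live.
print(next(text_encoder_2.parameters()).device)  # e.g. cuda:0
print(next(transformer.parameters()).device)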
Switching to device_map="auto" instead makes FluxPipeline raise a different error:

NotImplementedError: Currently, device_map is automatically inferred for quantized models. Support for providing device_map as an input will be added in the future.

The updated code:
from transformers import T5EncoderModel
from diffusers import FluxTransformer2DModel, FluxPipeline
import torch
import gc

assert torch.cuda.is_available(), "CUDA is not available. Please check your GPU and CUDA installation."
device = torch.device("cuda:0")

def flush():
    """Frees Python garbage and resets CUDA memory statistics."""
    gc.collect()
    torch.cuda.empty_cache()
    torch.cuda.reset_max_memory_allocated()
    torch.cuda.reset_peak_memory_stats()

def bytes_to_giga_bytes(bytes):
    return f"{(bytes / 1024 / 1024 / 1024):.3f}"

flush()

nf4_model_id = "sayakpaul/flux.1-dev-nf4-pkg"
text_encoder_2 = T5EncoderModel.from_pretrained(
    nf4_model_id, subfolder="text_encoder_2", torch_dtype=torch.float16, device_map="auto"
)
transformer = FluxTransformer2DModel.from_pretrained(
    nf4_model_id, subfolder="transformer", torch_dtype=torch.float16, device_map="auto"
)
# Passing device_map="auto" to the pipeline is what raises the
# NotImplementedError above.
pipe = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    text_encoder_2=text_encoder_2,
    transformer=transformer,
    torch_dtype=torch.float16,
    device_map="auto"
)

prompt = "A mystic cat with a sign that says hello world!"
generator = torch.Generator(device=device).manual_seed(0)
image = pipe(
    prompt,
    guidance_scale=3.5,
    num_inference_steps=50,
    generator=generator
).images[0]
image
If I want everything to run on one GPU, how should I change the code?
Yeah, all of the above is correct. If you want to place your pipeline directly on a GPU, i.e., do pipeline.to("cuda") and not enable_model_cpu_offload(), you can do so. You will have to install diffusers from the latest commit of this PR: https://github.com/huggingface/diffusers/pull/9213.
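Concretely (a sketch based on that answer, not verbatim from the thread): after installing diffusers from the PR, drop the .to(device) and device_map arguments from the component loading and move only the assembled pipeline:

from transformers import T5EncoderModel
from diffusers import FluxTransformer2DModel, FluxPipeline
import torch

nf4_model_id = "sayakpaul/flux.1-dev-nf4-pkg"
# Load the quantized components as-is: no .to(...) and no device_map.
text_encoder_2 = T5EncoderModel.from_pretrained(
    nf4_model_id, subfolder="text_encoder_2", torch_dtype=torch.float16
)
transformer = FluxTransformer2DModel.from_pretrained(
    nf4_model_id, subfolder="transformer", torch_dtype=torch.float16
)
pipe = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    text_encoder_2=text_encoder_2,
    transformer=transformer,
    torch_dtype=torch.float16
)
# With the PR installed, moving the whole pipeline to one GPU should work:
pipe.to("cuda")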