# Page-scrape residue from the Hugging Face Spaces UI (not part of the module):
# "Spaces: Sleeping / Sleeping"
import random

import diffusers
import torch
from PIL import Image
from tqdm import tqdm

from constants import SUBJECTS, MEDIUMS
class CLIPSlider:
    """Shift text-to-image prompt embeddings along learned text directions.

    A direction is the average difference between the text encoder's pooled
    outputs for prompts containing a target word and prompts containing its
    opposite (see ``find_latent_direction``). ``generate`` adds a scaled copy
    of one or two such directions to the prompt embeddings before running the
    pipeline.
    """

    def __init__(
            self,
            sd_pipe,
            device: torch.device,
            target_word: str = "",
            opposite: str = "",
            target_word_2nd: str = "",
            opposite_2nd: str = "",
            iterations: int = 300,
    ):
        """Move the pipeline to ``device`` and precompute the directions.

        Args:
            sd_pipe: Pipeline object. It must support ``.to(device)``, expose
                ``tokenizer`` and ``text_encoder`` and be callable with
                ``prompt_embeds=`` (presumably a diffusers Stable Diffusion
                pipeline -- confirm against the caller).
            device: Device the pipeline and all token tensors are placed on.
            target_word: Word describing the positive end of the 1st slider.
            opposite: Word describing the negative end of the 1st slider.
            target_word_2nd: Positive word of the optional 2nd slider.
            opposite_2nd: Negative word of the optional 2nd slider.
            iterations: Number of random prompt pairs averaged per direction.
        """
        self.device = device
        self.pipe = sd_pipe.to(self.device)
        self.iterations = iterations

        # A direction is only computed when at least one of its words is set.
        if target_word != "" or opposite != "":
            self.avg_diff = self.find_latent_direction(target_word, opposite)
        else:
            self.avg_diff = None

        if target_word_2nd != "" or opposite_2nd != "":
            self.avg_diff_2nd = self.find_latent_direction(target_word_2nd, opposite_2nd)
        else:
            self.avg_diff_2nd = None

    def find_latent_direction(self,
                              target_word: str,
                              opposite: str):
        """Return the mean pooled-embedding difference target - opposite.

        For ``self.iterations`` rounds a random medium and subject are drawn
        from ``MEDIUMS`` / ``SUBJECTS`` and the prompt pair
        ``"a {medium} of a {word} {subject}"`` is encoded, e.g. with
        target_word = "happy" and opposite = "sad".

        Returns:
            Tensor with a leading dimension of 1 (``mean(0, keepdim=True)``
            over the per-prompt differences of ``pooler_output``).
        """
        tokenizer = self.pipe.tokenizer
        positives = []
        negatives = []
        with torch.no_grad():
            for _ in tqdm(range(self.iterations)):
                medium = random.choice(MEDIUMS)
                subject = random.choice(SUBJECTS)
                pos_prompt = f"a {medium} of a {target_word} {subject}"
                neg_prompt = f"a {medium} of a {opposite} {subject}"
                # Token ids follow the configured device instead of a
                # hard-coded CUDA device, so non-CUDA devices work as well.
                pos_toks = tokenizer(pos_prompt, return_tensors="pt", padding="max_length", truncation=True,
                                     max_length=tokenizer.model_max_length).input_ids.to(self.device)
                neg_toks = tokenizer(neg_prompt, return_tensors="pt", padding="max_length", truncation=True,
                                     max_length=tokenizer.model_max_length).input_ids.to(self.device)
                positives.append(self.pipe.text_encoder(pos_toks).pooler_output)
                negatives.append(self.pipe.text_encoder(neg_toks).pooler_output)

            diffs = torch.cat(positives, dim=0) - torch.cat(negatives, dim=0)
            avg_diff = diffs.mean(0, keepdim=True)
        return avg_diff

    def generate(self,
                 prompt="a photo of a house",
                 scale=2.,
                 scale_2nd=0.,  # scale for the 2nd dim directions when avg_diff_2nd is not None
                 seed=15,
                 only_pooler=False,
                 normalize_scales=False,  # whether to normalize the scales when avg_diff_2nd is not None
                 correlation_weight_factor=1.0,
                 avg_diff=None,
                 avg_diff_2nd=None,
                 **pipeline_kwargs
                 ):
        """Generate one image with the prompt embeddings shifted by the slider.

        If doing the full sequence, scales in [-0.3, 0.3] work well (higher if
        correlation weighting is used); if ``only_pooler`` is set, [-4, 4]
        work well.

        Args:
            prompt: Text prompt.
            scale: Multiplier for the 1st direction.
            scale_2nd: Multiplier for the 2nd direction (ignored without one).
            seed: Passed to ``torch.manual_seed`` right before the pipeline call.
            only_pooler: If true, only the embedding at ``toks.argmax()`` is
                shifted (presumably the end-of-text token -- confirm for the
                tokenizer in use); otherwise every token embedding is shifted,
                weighted by its cosine similarity to that token.
            normalize_scales: With a 2nd direction, divide both scales by
                ``abs(scale) + abs(scale_2nd)``; skipped when that sum is 0.
            correlation_weight_factor: Interpolates the per-token weights
                between 1 (factor 0) and the cosine similarity (factor 1).
            avg_diff: 1st direction. When omitted, the directions stored on
                the instance (``self.avg_diff`` / ``self.avg_diff_2nd``) are used.
            avg_diff_2nd: Optional 2nd direction.
            **pipeline_kwargs: Forwarded to the pipeline call.

        Returns:
            The first entry of ``.images`` of the pipeline output.

        Raises:
            ValueError: If no 1st direction is passed or stored on the instance.
        """
        # Fall back to the precomputed directions only when the caller passed
        # no 1st direction; an explicit avg_diff is never paired implicitly
        # with the instance's 2nd direction.
        if avg_diff is None:
            avg_diff = self.avg_diff
            if avg_diff_2nd is None:
                avg_diff_2nd = self.avg_diff_2nd
        if avg_diff is None:
            raise ValueError("no latent direction available: pass avg_diff or "
                             "construct the slider with target_word/opposite")

        # `is not None` rather than truthiness: bool() of a multi-element
        # tensor raises.
        has_2nd = avg_diff_2nd is not None
        if has_2nd and normalize_scales:
            denominator = abs(scale) + abs(scale_2nd)
            if denominator:  # both scales 0 -> nothing to normalize
                scale = scale / denominator
                scale_2nd = scale_2nd / denominator

        with torch.no_grad():
            tokenizer = self.pipe.tokenizer
            toks = tokenizer(prompt, return_tensors="pt", padding="max_length", truncation=True,
                             max_length=tokenizer.model_max_length).input_ids.to(self.device)
            prompt_embeds = self.pipe.text_encoder(toks).last_hidden_state
            # Flattened argmax; only meaningful as a position for a single prompt.
            anchor = int(toks.argmax())

            if only_pooler:
                prompt_embeds[:, anchor] = prompt_embeds[:, anchor] + avg_diff * scale
                if has_2nd:
                    prompt_embeds[:, anchor] += avg_diff_2nd * scale_2nd
            else:
                normed_prompt_embeds = prompt_embeds / prompt_embeds.norm(dim=-1, keepdim=True)
                sims = normed_prompt_embeds[0] @ normed_prompt_embeds[0].T
                # One weight per token, broadcast over the embedding width
                # instead of repeating to a hard-coded width of 768.
                weights = sims[anchor][None, :, None]
                weights = 1.0 + (weights - 1.0) * correlation_weight_factor
                prompt_embeds = prompt_embeds + weights * avg_diff[None, :] * scale
                if has_2nd:
                    prompt_embeds = prompt_embeds + weights * avg_diff_2nd[None, :] * scale_2nd

            torch.manual_seed(seed)
            image = self.pipe(prompt_embeds=prompt_embeds, **pipeline_kwargs).images[0]
        return image

    def spectrum(self,
                 prompt="a photo of a house",
                 low_scale=-2,
                 low_scale_2nd=-2,
                 high_scale=2,
                 high_scale_2nd=2,
                 steps=5,
                 seed=15,
                 only_pooler=False,
                 normalize_scales=False,
                 correlation_weight_factor=1.0,
                 **pipeline_kwargs
                 ):
        """Render ``steps`` images across the scale range as one horizontal strip.

        Scales are spaced linearly from the low to the high values; with
        ``steps == 1`` the low values are used. All images share ``seed``.

        Returns:
            An RGB ``PIL.Image`` with the generated images pasted left to
            right; the tile size is taken from the first generated image.

        Raises:
            ValueError: If ``steps`` is smaller than 1.
        """
        if steps < 1:
            raise ValueError("steps must be at least 1")

        last = max(steps - 1, 1)  # avoids dividing by zero for a single step
        images = []
        for i in range(steps):
            scale = low_scale + (high_scale - low_scale) * i / last
            scale_2nd = low_scale_2nd + (high_scale_2nd - low_scale_2nd) * i / last
            # generate() already returns a single image, so it is stored as is.
            images.append(self.generate(prompt,
                                        scale=scale,
                                        scale_2nd=scale_2nd,
                                        seed=seed,
                                        only_pooler=only_pooler,
                                        normalize_scales=normalize_scales,
                                        correlation_weight_factor=correlation_weight_factor,
                                        **pipeline_kwargs))

        tile_width, tile_height = images[0].size
        canvas = Image.new('RGB', (tile_width * steps, tile_height))
        for i, im in enumerate(images):
            canvas.paste(im, (tile_width * i, 0))
        return canvas
class CLIPSliderXL(CLIPSlider):
    """CLIPSlider variant for pipelines with two text encoders.

    Uses ``pipe.tokenizer`` / ``pipe.text_encoder`` and ``pipe.tokenizer_2`` /
    ``pipe.text_encoder_2`` (presumably an SDXL pipeline -- confirm against
    the caller). Every direction is a pair ``(direction_1, direction_2)``,
    one entry per text encoder.
    """

    def _token_ids(self, tokenizer, prompt):
        """Tokenize ``prompt``, padded/truncated to the tokenizer's max length."""
        return tokenizer(prompt, return_tensors="pt", padding="max_length", truncation=True,
                         max_length=tokenizer.model_max_length).input_ids

    def find_latent_direction(self,
                              target_word: str,
                              opposite: str):
        """Return the mean embedding difference target - opposite per encoder.

        For ``self.iterations`` rounds a random medium and subject are drawn
        from ``MEDIUMS`` / ``SUBJECTS`` and the prompt pair
        ``"a {medium} of a {word} {subject}"`` is encoded, e.g. with
        target_word = "happy" and opposite = "sad".

        Returns:
            Tuple ``(avg_diff, avg_diff2)``: the averaged difference of
            ``pooler_output`` from the 1st encoder and of ``text_embeds`` from
            the 2nd encoder, each with a leading dimension of 1.
        """
        # (tokenizer, encoder, name of the output field that is averaged)
        encoders = (
            (self.pipe.tokenizer, self.pipe.text_encoder, "pooler_output"),
            (self.pipe.tokenizer_2, self.pipe.text_encoder_2, "text_embeds"),
        )
        per_encoder_diffs = [[] for _ in encoders]
        with torch.no_grad():
            for _ in tqdm(range(self.iterations)):
                medium = random.choice(MEDIUMS)
                subject = random.choice(SUBJECTS)
                pos_prompt = f"a {medium} of a {target_word} {subject}"
                neg_prompt = f"a {medium} of a {opposite} {subject}"
                for diffs, (tokenizer, text_encoder, field) in zip(per_encoder_diffs, encoders):
                    # Token ids follow the configured device instead of a
                    # hard-coded CUDA device.
                    pos_toks = self._token_ids(tokenizer, pos_prompt).to(self.device)
                    neg_toks = self._token_ids(tokenizer, neg_prompt).to(self.device)
                    pos = getattr(text_encoder(pos_toks), field)
                    neg = getattr(text_encoder(neg_toks), field)
                    diffs.append(pos - neg)

            avg_diff, avg_diff2 = (torch.cat(diffs, dim=0).mean(0, keepdim=True)
                                   for diffs in per_encoder_diffs)
        return (avg_diff, avg_diff2)

    def generate(self,
                 prompt="a photo of a house",
                 scale=2,
                 scale_2nd=2,
                 seed=15,
                 only_pooler=False,
                 normalize_scales=False,
                 correlation_weight_factor=1.0,
                 avg_diff=None,
                 avg_diff_2nd=None,
                 **pipeline_kwargs
                 ):
        """Generate one image with both encoders' embeddings shifted.

        If doing the full sequence, scales in [-0.3, 0.3] work well (higher if
        correlation weighting is used); if ``only_pooler`` is set, [-4, 4]
        work well.

        Args:
            prompt: Text prompt.
            scale: Multiplier for the 1st direction.
            scale_2nd: Multiplier for the 2nd direction (ignored without one).
            seed: Passed to ``torch.manual_seed`` right before the pipeline call.
            only_pooler: If true, only the embedding at ``toks.argmax()`` is
                shifted (presumably the end-of-text token -- confirm for the
                tokenizers in use); otherwise every token embedding is
                shifted, weighted by its cosine similarity to that token.
            normalize_scales: With a 2nd direction, divide both scales by
                ``abs(scale) + abs(scale_2nd)``; skipped when that sum is 0.
            correlation_weight_factor: Interpolates the per-token weights
                between 1 (factor 0) and the cosine similarity (factor 1).
            avg_diff: 1st direction as a pair, one entry per encoder. When
                omitted, the directions stored on the instance are used.
            avg_diff_2nd: Optional 2nd direction, same layout.
            **pipeline_kwargs: Forwarded to the pipeline call.

        Returns:
            The first entry of ``.images`` of the pipeline output.

        Raises:
            ValueError: If no 1st direction is passed or stored on the instance.
        """
        # Fall back to the precomputed directions only when the caller passed
        # no 1st direction.
        if avg_diff is None:
            avg_diff = self.avg_diff
            if avg_diff_2nd is None:
                avg_diff_2nd = self.avg_diff_2nd
        if avg_diff is None:
            raise ValueError("no latent direction available: pass avg_diff or "
                             "construct the slider with target_word/opposite")

        # Normalize once, before the per-encoder loop.
        has_2nd = avg_diff_2nd is not None
        if has_2nd and normalize_scales:
            denominator = abs(scale) + abs(scale_2nd)
            if denominator:  # both scales 0 -> nothing to normalize
                scale = scale / denominator
                scale_2nd = scale_2nd / denominator

        text_encoders = [self.pipe.text_encoder, self.pipe.text_encoder_2]
        tokenizers = [self.pipe.tokenizer, self.pipe.tokenizer_2]

        with torch.no_grad():
            prompt_embeds_list = []
            for i, (tokenizer, text_encoder) in enumerate(zip(tokenizers, text_encoders)):
                toks = tokenizer(
                    prompt,
                    padding="max_length",
                    max_length=tokenizer.model_max_length,
                    truncation=True,
                    return_tensors="pt",
                ).input_ids

                encoded = text_encoder(
                    toks.to(text_encoder.device),
                    output_hidden_states=True,
                )

                # We are only ALWAYS interested in the pooled output of the final text encoder
                pooled_prompt_embeds = encoded[0]
                prompt_embeds = encoded.hidden_states[-2]

                # Each encoder gets its own direction entry; reusing entry 0
                # for the 2nd encoder would mix embedding widths.
                direction = avg_diff[i]
                direction_2nd = avg_diff_2nd[i] if has_2nd else None
                # Flattened argmax; only meaningful as a position for a single prompt.
                anchor = int(toks.argmax())

                if only_pooler:
                    prompt_embeds[:, anchor] = prompt_embeds[:, anchor] + direction * scale
                    if has_2nd:
                        prompt_embeds[:, anchor] += direction_2nd * scale_2nd
                else:
                    normed_prompt_embeds = prompt_embeds / prompt_embeds.norm(dim=-1, keepdim=True)
                    sims = normed_prompt_embeds[0] @ normed_prompt_embeds[0].T
                    # One weight per token, broadcast over the embedding width
                    # (no hard-coded 768 / 1280).
                    weights = sims[anchor][None, :, None]
                    weights = 1.0 + (weights - 1.0) * correlation_weight_factor
                    prompt_embeds = prompt_embeds + weights * direction[None, :] * scale
                    if has_2nd:
                        prompt_embeds = prompt_embeds + weights * direction_2nd[None, :] * scale_2nd

                prompt_embeds_list.append(prompt_embeds)

            bs_embed = prompt_embeds_list[-1].shape[0]
            prompt_embeds = torch.concat(prompt_embeds_list, dim=-1)
            pooled_prompt_embeds = pooled_prompt_embeds.view(bs_embed, -1)

            torch.manual_seed(seed)
            image = self.pipe(prompt_embeds=prompt_embeds, pooled_prompt_embeds=pooled_prompt_embeds,
                              **pipeline_kwargs).images[0]
        return image
class CLIPSlider3(CLIPSlider):
    """CLIPSlider variant for pipelines with two CLIP encoders plus a third one.

    Uses ``pipe.tokenizer`` / ``pipe.text_encoder``, ``pipe.tokenizer_2`` /
    ``pipe.text_encoder_2`` and ``pipe.tokenizer_3`` / ``pipe.text_encoder_3``
    (presumably a Stable Diffusion 3 pipeline whose third encoder is T5 --
    confirm against the caller). Directions are pairs with one entry per CLIP
    encoder; the third encoder's embeddings are passed through unmodified.
    """

    def _token_ids(self, tokenizer, prompt):
        """Tokenize ``prompt``, padded/truncated to the tokenizer's max length."""
        return tokenizer(prompt, return_tensors="pt", padding="max_length", truncation=True,
                         max_length=tokenizer.model_max_length).input_ids

    def find_latent_direction(self,
                              target_word: str,
                              opposite: str):
        """Return the mean ``text_embeds`` difference target - opposite per CLIP encoder.

        For ``self.iterations`` rounds a random medium and subject are drawn
        from ``MEDIUMS`` / ``SUBJECTS`` and the prompt pair
        ``"a {medium} of a {word} {subject}"`` is encoded, e.g. with
        target_word = "happy" and opposite = "sad".

        Returns:
            Tuple ``(avg_diff, avg_diff2)``, one averaged difference per CLIP
            encoder, each with a leading dimension of 1.
        """
        encoders = (
            (self.pipe.tokenizer, self.pipe.text_encoder),
            (self.pipe.tokenizer_2, self.pipe.text_encoder_2),
        )
        per_encoder_diffs = [[] for _ in encoders]
        with torch.no_grad():
            for _ in tqdm(range(self.iterations)):
                medium = random.choice(MEDIUMS)
                subject = random.choice(SUBJECTS)
                pos_prompt = f"a {medium} of a {target_word} {subject}"
                neg_prompt = f"a {medium} of a {opposite} {subject}"
                for diffs, (tokenizer, text_encoder) in zip(per_encoder_diffs, encoders):
                    # Token ids follow the configured device instead of a
                    # hard-coded CUDA device.
                    pos_toks = self._token_ids(tokenizer, pos_prompt).to(self.device)
                    neg_toks = self._token_ids(tokenizer, neg_prompt).to(self.device)
                    diffs.append(text_encoder(pos_toks).text_embeds
                                 - text_encoder(neg_toks).text_embeds)

            avg_diff, avg_diff2 = (torch.cat(diffs, dim=0).mean(0, keepdim=True)
                                   for diffs in per_encoder_diffs)
        return (avg_diff, avg_diff2)

    def generate(self,
                 prompt="a photo of a house",
                 scale=2,
                 seed=15,
                 only_pooler=False,
                 correlation_weight_factor=1.0,
                 avg_diff=None,
                 avg_diff_2nd=None,
                 scale_2nd=0.,
                 normalize_scales=False,
                 **pipeline_kwargs
                 ):
        """Generate one image with the two CLIP encoders' embeddings shifted.

        If doing the full sequence, scales in [-0.3, 0.3] work well (higher if
        correlation weighting is used); if ``only_pooler`` is set, [-4, 4]
        work well.

        The shifted CLIP embeddings are concatenated along the feature axis,
        zero-padded to the third encoder's feature width and concatenated
        with its (unshifted) embeddings along the sequence axis. The pooled
        outputs of both CLIP encoders are concatenated along the feature axis.

        Args:
            prompt: Text prompt.
            scale: Multiplier for the 1st direction.
            seed: Passed to ``torch.manual_seed`` right before the pipeline call.
            only_pooler: If true, only the embedding at ``toks.argmax()`` is
                shifted (presumably the end-of-text token -- confirm for the
                tokenizers in use); otherwise every token embedding is
                shifted, weighted by its cosine similarity to that token.
            correlation_weight_factor: Interpolates the per-token weights
                between 1 (factor 0) and the cosine similarity (factor 1).
            avg_diff: 1st direction as a pair, one entry per CLIP encoder.
                When omitted, the directions stored on the instance are used.
            avg_diff_2nd: Optional 2nd direction, same layout.
            scale_2nd: Multiplier for the 2nd direction (ignored without one).
            normalize_scales: With a 2nd direction, divide both scales by
                ``abs(scale) + abs(scale_2nd)``; skipped when that sum is 0.
            **pipeline_kwargs: Forwarded to the pipeline call.

        Returns:
            The first entry of ``.images`` of the pipeline output.

        Raises:
            ValueError: If no 1st direction is passed or stored on the instance.
        """
        # Fall back to the precomputed directions only when the caller passed
        # no 1st direction.
        if avg_diff is None:
            avg_diff = self.avg_diff
            if avg_diff_2nd is None:
                avg_diff_2nd = self.avg_diff_2nd
        if avg_diff is None:
            raise ValueError("no latent direction available: pass avg_diff or "
                             "construct the slider with target_word/opposite")

        has_2nd = avg_diff_2nd is not None
        if has_2nd and normalize_scales:
            denominator = abs(scale) + abs(scale_2nd)
            if denominator:  # both scales 0 -> nothing to normalize
                scale = scale / denominator
                scale_2nd = scale_2nd / denominator

        clip_text_encoders = [self.pipe.text_encoder, self.pipe.text_encoder_2]
        clip_tokenizers = [self.pipe.tokenizer, self.pipe.tokenizer_2]

        with torch.no_grad():
            clip_prompt_embeds_list = []
            clip_pooled_prompt_embeds_list = []

            for i, (tokenizer, text_encoder) in enumerate(zip(clip_tokenizers, clip_text_encoders)):
                toks = tokenizer(
                    prompt,
                    padding="max_length",
                    max_length=tokenizer.model_max_length,
                    truncation=True,
                    return_tensors="pt",
                ).input_ids

                encoded = text_encoder(
                    toks.to(text_encoder.device),
                    output_hidden_states=True,
                )
                prompt_embeds = encoded.hidden_states[-2]
                # The batch size is read before it is used to flatten the
                # pooled output.
                bs_embed = prompt_embeds.shape[0]
                clip_pooled_prompt_embeds_list.append(encoded[0].view(bs_embed, -1))

                direction = avg_diff[i]
                direction_2nd = avg_diff_2nd[i] if has_2nd else None
                # Flattened argmax; only meaningful as a position for a single prompt.
                anchor = int(toks.argmax())

                if only_pooler:
                    prompt_embeds[:, anchor] = prompt_embeds[:, anchor] + direction * scale
                    if has_2nd:
                        prompt_embeds[:, anchor] += direction_2nd * scale_2nd
                else:
                    normed_prompt_embeds = prompt_embeds / prompt_embeds.norm(dim=-1, keepdim=True)
                    sims = normed_prompt_embeds[0] @ normed_prompt_embeds[0].T
                    # One weight per token, broadcast over the embedding width
                    # (no hard-coded 768 / 1280).
                    weights = sims[anchor][None, :, None]
                    weights = 1.0 + (weights - 1.0) * correlation_weight_factor
                    prompt_embeds = prompt_embeds + weights * direction[None, :] * scale
                    if has_2nd:
                        prompt_embeds = prompt_embeds + weights * direction_2nd[None, :] * scale_2nd

                clip_prompt_embeds_list.append(prompt_embeds)

            clip_prompt_embeds = torch.concat(clip_prompt_embeds_list, dim=-1)
            clip_pooled_prompt_embeds = torch.concat(clip_pooled_prompt_embeds_list, dim=-1)

            # Third encoder: runs after the CLIP loop and is not shifted.
            # NOTE(review): the max length is read from the pipeline's
            # `tokenizer_max_length` when present -- confirm this matches the
            # installed diffusers version's own prompt encoding.
            t5_max_length = getattr(self.pipe, "tokenizer_max_length",
                                    self.pipe.tokenizer.model_max_length)
            t5_toks = self.pipe.tokenizer_3(
                prompt,
                padding="max_length",
                max_length=t5_max_length,
                truncation=True,
                add_special_tokens=True,
                return_tensors="pt",
            ).input_ids
            t5_prompt_embeds = self.pipe.text_encoder_3(t5_toks.to(self.device))[0]

            # Zero-pad the CLIP features to the third encoder's width so both
            # can be stacked along the sequence axis.
            clip_prompt_embeds = torch.nn.functional.pad(
                clip_prompt_embeds, (0, t5_prompt_embeds.shape[-1] - clip_prompt_embeds.shape[-1])
            )
            prompt_embeds = torch.cat([clip_prompt_embeds, t5_prompt_embeds], dim=-2)

            torch.manual_seed(seed)
            image = self.pipe(prompt_embeds=prompt_embeds, pooled_prompt_embeds=clip_pooled_prompt_embeds,
                              **pipeline_kwargs).images[0]
        return image