piyushgrover committed
Commit 477daa4
1 Parent(s): 17ab401

added code files

Files changed (4):
  1. README.md +7 -10
  2. app.py +102 -0
  3. requirements.txt +9 -0
  4. utils.py +222 -0
README.md CHANGED
@@ -1,13 +1,10 @@
  ---
- title: Stable Diffusion Image Generation
- emoji: 📉
- colorFrom: indigo
- colorTo: purple
- sdk: gradio
- sdk_version: 3.49.0
- app_file: app.py
- pinned: false
  license: mit
+ title: YoloV3
+ sdk: gradio
+ colorFrom: yellow
+ colorTo: green
+ pinned: true
  ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # yolov3
+ S13 ERA V1
app.py ADDED
@@ -0,0 +1,102 @@
+ import gradio as gr
+ from utils import *
+ import random
+
+
+ is_clicked = False
+ out_img_list = [None, None, None, None, None]
+ out_state_list = [False, False, False, False, False]
+
+
+ def fn_query_on_load():
+     return "Cats at sunset"
+
+
+ def fn_refresh():
+     # The Gallery cannot render None placeholders, so only return the
+     # images that have been generated so far.
+     return [img for img in out_img_list if img is not None]
+
+
+ with gr.Blocks() as app:
+     with gr.Row():
+         gr.Markdown(
+             """
+             # Stable Diffusion Image Generation
+             ### Enter query to generate images in various styles
+             """)
+
+     with gr.Row(visible=True):
+         with gr.Column():
+             with gr.Row():
+                 search_text = gr.Textbox(value=fn_query_on_load, placeholder='Search...', label=None)
+
+             with gr.Row():
+                 submit_btn = gr.Button("Submit", variant='primary')
+                 clear_btn = gr.ClearButton()
+
+     with gr.Row(visible=True):
+         output_images = gr.Gallery(value=fn_refresh, interactive=False, every=5)
+
+
+     def clear_data():
+         return {
+             output_images: None,
+             search_text: None
+         }
+
+
+     clear_btn.click(clear_data, None, [output_images, search_text])
+
+
+     def func_generate(query):
+         global is_clicked
+         is_clicked = True
+         prompt = query + ' in the style of bulb'
+         text_input = tokenizer(prompt, padding="max_length", max_length=tokenizer.model_max_length, truncation=True,
+                                return_tensors="pt")
+         input_ids = text_input.input_ids.to(torch_device)
+
+         # Get token embeddings
+         position_ids = text_encoder.text_model.embeddings.position_ids[:, :77]
+         position_embeddings = pos_emb_layer(position_ids)
+
+         s = 0
+         for i in range(5):
+             token_embeddings = token_emb_layer(input_ids)
+             # The new embedding - the learned concept embedding for style i
+             replacement_token_embedding = concept_embeds[i].to(torch_device)
+
+             # Insert this into the token embeddings at the placeholder token position
+             token_embeddings[0, torch.where(input_ids[0] == 22373)] = replacement_token_embedding.to(torch_device)
+
+             # Combine with pos embs
+             input_embeddings = token_embeddings + position_embeddings
+
+             # Feed through to get final output embs
+             modified_output_embeddings = get_output_embeds(input_embeddings)
+
+             # And generate an image with this, storing it so the polling Gallery picks it up
+             s = random.randint(s + 1, s + 30)
+             g = torch.manual_seed(s)
+             out_img_list[i] = generate_with_embs(text_input, modified_output_embeddings, generator=g)
+             #output_images.append(dict(seed=s, output=output))
+
+         is_clicked = False
+
+         return None
+
+
+     submit_btn.click(
+         func_generate,
+         [search_text],
+         None
+     )
+
+
+ # Launch the app
+ app.queue().launch(share=True)
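
Note on how app.py surfaces results: output_images is a gr.Gallery created with a callable value and every=5, so while the queue is running it re-calls fn_refresh every five seconds and displays whatever func_generate has stored in out_img_list so far. A minimal, self-contained sketch of that polling pattern (the names demo, images, slow_job and poll are illustrative; it assumes gradio 3.x, where components accept every= together with a queued app):

    import time
    import numpy as np
    import gradio as gr

    images = []  # filled by a long-running job, read back by the Gallery poller

    def slow_job():
        # Stand-in for the diffusion loop: one random "image" per second.
        for _ in range(5):
            time.sleep(1)
            images.append(np.random.randint(0, 255, (64, 64, 3), dtype=np.uint8))

    def poll():
        # Re-queried every 5 seconds by the Gallery; return finished images only.
        return list(images)

    with gr.Blocks() as demo:
        start = gr.Button("Start")
        gallery = gr.Gallery(value=poll, every=5)
        start.click(slow_job, None, None)

    demo.queue().launch()
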
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ torch
+ torchvision
+ pillow
+ gradio
+ numpy
+ transformers==4.25.1
+ diffusers
+ ftfy
+ accelerate
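
The transformers==4.25.1 pin is load-bearing: utils.py below calls the private CLIPTextTransformer._build_causal_attention_mask helper, which later transformers releases are believed to have removed, so loosening the pin would likely break get_output_embeds. A small sketch of a start-up check one could add under that assumption:

    # Fail fast if the installed transformers no longer exposes the private
    # helper that utils.py relies on (assumption: it exists in 4.25.1).
    from transformers.models.clip.modeling_clip import CLIPTextTransformer

    assert hasattr(CLIPTextTransformer, "_build_causal_attention_mask"), (
        "This transformers release dropped _build_causal_attention_mask; "
        "keep transformers==4.25.1 as pinned above."
    )
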
utils.py ADDED
@@ -0,0 +1,222 @@
+ from base64 import b64encode
+
+ import numpy
+ import torch
+ from diffusers import AutoencoderKL, LMSDiscreteScheduler, UNet2DConditionModel
+ # from huggingface_hub import notebook_login
+
+ from matplotlib import pyplot as plt
+ from pathlib import Path
+ from PIL import Image
+ from torch import autocast
+ from torchvision import transforms as tfms
+ from tqdm.auto import tqdm
+ from transformers import CLIPTextModel, CLIPTokenizer, logging
+ import os
+
+ torch.manual_seed(1)
+
+ # Suppress some unnecessary warnings when loading the CLIPTextModel
+ logging.set_verbosity_error()
+
+ # Set device
+ torch_device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
+ if "mps" == torch_device: os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = "1"
+
+ import gc
+ gc.collect()
+ torch.cuda.empty_cache()
+
+ # Load the autoencoder model which will be used to decode the latents into image space.
+ vae = AutoencoderKL.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="vae")
+
+ # Load the tokenizer and text encoder to tokenize and encode the text.
+ tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
+ text_encoder = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14")
+
+ # The UNet model for generating the latents.
+ unet = UNet2DConditionModel.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="unet")
+
+ # The noise scheduler
+ scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear",
+                                  num_train_timesteps=1000)
+
+ # To the GPU we go!
+ vae = vae.to(torch_device)
+ text_encoder = text_encoder.to(torch_device)
+ unet = unet.to(torch_device)
+
+
+ def load_learned_embeds():
+     # Collect every 768-dim learned concept vector saved under learned_embeds/
+     pathlist = Path('learned_embeds/').glob('*_learned_embeds.bin')
+     learned_embeds = []
+
+     for path in pathlist:
+         path_in_str = str(path)
+         # print(path_in_str)
+         learned_embeds.append(torch.load(path_in_str))
+
+     concept_embeds_list = []
+     for obj in learned_embeds:
+         for k, v in obj.items():
+             if v.shape[0] == 768:
+                 print(k, v.shape)
+                 concept_embeds_list.append(v)
+
+     return torch.stack(concept_embeds_list)
+
+
+ def pil_to_latent(input_im):
+     # Single image -> single latent in a batch (so size 1, 4, 64, 64)
+     with torch.no_grad():
+         latent = vae.encode(tfms.ToTensor()(input_im).unsqueeze(0).to(torch_device) * 2 - 1)  # Note scaling
+     return 0.18215 * latent.latent_dist.sample()
+
+
+ def latents_to_pil(latents):
+     # batch of latents -> list of images
+     latents = (1 / 0.18215) * latents
+     with torch.no_grad():
+         image = vae.decode(latents).sample
+     image = (image / 2 + 0.5).clamp(0, 1)
+     image = image.detach().cpu().permute(0, 2, 3, 1).numpy()
+     images = (image * 255).round().astype("uint8")
+     pil_images = [Image.fromarray(image) for image in images]
+     return pil_images
+
+
+ # Prep Scheduler
+ def set_timesteps(scheduler, num_inference_steps):
+     scheduler.set_timesteps(num_inference_steps)
+     scheduler.timesteps = scheduler.timesteps.to(
+         torch.float32)  # minor fix to ensure MPS compatibility, fixed in diffusers PR 3925
+
+
+ def get_output_embeds(input_embeddings):
+     # CLIP's text model uses causal mask, so we prepare it here:
+     bsz, seq_len = input_embeddings.shape[:2]
+     causal_attention_mask = text_encoder.text_model._build_causal_attention_mask(bsz, seq_len,
+                                                                                  dtype=input_embeddings.dtype)
+
+     # Getting the output embeddings involves calling the model with output_hidden_states=True
+     # so that it doesn't just return the pooled final predictions:
+     encoder_outputs = text_encoder.text_model.encoder(
+         inputs_embeds=input_embeddings,
+         attention_mask=None,  # We aren't using an attention mask so that can be None
+         causal_attention_mask=causal_attention_mask.to(torch_device),
+         output_attentions=None,
+         output_hidden_states=True,  # We want the output embs not the final output
+         return_dict=None,
+     )
+
+     # We're interested in the output hidden state only
+     output = encoder_outputs[0]
+
+     # There is a final layer norm we need to pass these through
+     output = text_encoder.text_model.final_layer_norm(output)
+
+     # And now they're ready!
+     return output
+
+
+ # Guidance loss (the name is kept from the original notebook): it penalises how far
+ # the pixels are from a heavily contrast-boosted copy of themselves.
+ def blue_loss(images):
+     contrast = 230  # it ranges from -255 to +255
+     contrast_scale_factor = (259 * (contrast + 255)) / (255 * (259 - contrast))
+     cimgs = contrast_scale_factor * (images - 0.5) + 0.5
+     cimgs = torch.where(cimgs > 1.0, 1.0, cimgs)
+     cimgs = torch.where(cimgs < 0.0, 0.0, cimgs)
+     error = torch.abs(images - cimgs).mean()
+     # error = torch.abs(images[:] - 0.9).mean()  # [:,2] -> all images in batch, only the blue channel
+     print('error: ', error)
+     return error
+
+
+ # Generating an image with these modified embeddings
+ def generate_with_embs(text_input, text_embeddings, generator=None, additional_guidance=False):
+     height = 512  # default height of Stable Diffusion
+     width = 512  # default width of Stable Diffusion
+     num_inference_steps = 30  # Number of denoising steps
+     guidance_scale = 7.5  # Scale for classifier-free guidance
+
+     if generator is None:
+         generator = torch.manual_seed(32)  # Seed generator to create the initial latent noise
+
+     batch_size = 1
+
+     max_length = text_input.input_ids.shape[-1]
+     uncond_input = tokenizer(
+         [""] * batch_size, padding="max_length", max_length=max_length, return_tensors="pt"
+     )
+     with torch.no_grad():
+         uncond_embeddings = text_encoder(uncond_input.input_ids.to(torch_device))[0]
+     text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+
+     # Prep Scheduler
+     set_timesteps(scheduler, num_inference_steps)
+
+     # Prep latents
+     latents = torch.randn(
+         (batch_size, unet.in_channels, height // 8, width // 8),
+         generator=generator,
+     )
+     latents = latents.to(torch_device)
+     latents = latents * scheduler.init_noise_sigma
+
+     # Loop
+     for i, t in tqdm(enumerate(scheduler.timesteps), total=len(scheduler.timesteps)):
+         # expand the latents if we are doing classifier-free guidance to avoid doing two forward passes.
+         latent_model_input = torch.cat([latents] * 2)
+         sigma = scheduler.sigmas[i]
+         latent_model_input = scheduler.scale_model_input(latent_model_input, t)
+
+         # predict the noise residual
+         with torch.no_grad():
+             noise_pred = unet(latent_model_input, t, encoder_hidden_states=text_embeddings)["sample"]
+
+         # perform guidance
+         noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+         noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+         #### ADDITIONAL GUIDANCE ###
+         if additional_guidance:
+             blue_loss_scale = 80
+             if i % 5 == 0:
+                 # Requires grad on the latents
+                 latents = latents.detach().requires_grad_()
+
+                 # Get the predicted x0:
+                 latents_x0 = latents - sigma * noise_pred
+                 # latents_x0 = scheduler.step(noise_pred, t, latents).pred_original_sample
+
+                 # Decode to image space
+                 denoised_images = vae.decode((1 / 0.18215) * latents_x0).sample / 2 + 0.5  # range (0, 1)
+
+                 # Calculate loss
+                 loss = blue_loss(denoised_images) * blue_loss_scale
+
+                 # Occasionally print it out
+                 if i % 10 == 0:
+                     print(i, 'loss:', loss.item())
+
+                 # Get gradient
+                 cond_grad = torch.autograd.grad(loss, latents)[0]
+
+                 # Modify the latents based on this gradient
+                 latents = latents.detach() - cond_grad * sigma ** 2
+
+         # compute the previous noisy sample x_t -> x_t-1
+         latents = scheduler.step(noise_pred, t, latents).prev_sample
+
+     return latents_to_pil(latents)[0]
+
+
+ concept_embeds = load_learned_embeds()
+
+ token_emb_layer = text_encoder.text_model.embeddings.token_embedding
+ # token_emb_layer  # Vocab size 49408, emb_dim 768
+
+ pos_emb_layer = text_encoder.text_model.embeddings.position_embedding
+ # pos_emb_layer
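
Taken together, app.py drives these helpers roughly as in the condensed sketch below (the prompt, seed, concept index and output filename are illustrative; it assumes a learned_embeds/ folder with at least one *_learned_embeds.bin file so concept_embeds is non-empty):

    import torch
    from utils import (tokenizer, text_encoder, torch_device, concept_embeds,
                       token_emb_layer, pos_emb_layer, get_output_embeds,
                       generate_with_embs)

    prompt = "Cats at sunset in the style of bulb"
    text_input = tokenizer(prompt, padding="max_length",
                           max_length=tokenizer.model_max_length,
                           truncation=True, return_tensors="pt")
    input_ids = text_input.input_ids.to(torch_device)

    # Token embeddings for the prompt, with the placeholder token's embedding
    # swapped for the first learned concept vector (token id mirrors app.py).
    token_embeddings = token_emb_layer(input_ids)
    token_embeddings[0, torch.where(input_ids[0] == 22373)] = concept_embeds[0].to(torch_device)

    # Add position embeddings and run the CLIP text encoder stack.
    position_ids = text_encoder.text_model.embeddings.position_ids[:, :77]
    input_embeddings = token_embeddings + pos_emb_layer(position_ids)
    output_embeddings = get_output_embeds(input_embeddings)

    # Run the diffusion loop and save the resulting PIL image.
    image = generate_with_embs(text_input, output_embeddings,
                               generator=torch.manual_seed(42))
    image.save("concept_0.png")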