Artiprocher committed on
Commit
02640dd
1 Parent(s): 1f5210d

update model

README.md CHANGED
@@ -8,40 +8,18 @@ tags:
 
  # Chinese Latent Diffusion Model
 
- We have open-sourced a Chinese Latent Diffusion model that generates beautiful images of food.
+ We have open-sourced a Chinese Latent Diffusion model (food domain).
 
  * Github: [EasyNLP](https://github.com/alibaba/EasyNLP)
 
- ## Model Introduction
-
- The model consists of the following components:
-
- * Text Encoder: converts the Chinese text input into embedding vectors
- * Latent Diffusion Model: denoises randomly generated noise in the latent space, conditioned on the text input
- * Autoencoder: decodes latent-space tensors back into images
- * Super Resolution: upscales the image resolution
-
- We use a Chinese CLIP-ViT-L model as the Text Encoder, the Autoencoder from [latent-diffusion](https://github.com/CompVis/latent-diffusion), and [ESRGAN](https://github.com/xinntao/ESRGAN) as the Super Resolution model. The Latent Diffusion Model was pre-trained on twenty million image-text pairs from the [Noah-Wukong](https://wukong-dataset.github.io/wukong-dataset/) dataset and then fine-tuned on an internal food dataset.
-
- ## Usage
-
- This model is built on Diffusers; please install Diffusers first:
-
- ```
- pip install diffusers
- ```
-
  ```python
- from LdmZhPipeline import LDMZhTextToImagePipeline
+ from diffusers import StableDiffusionPipeline
 
- generator = LDMZhTextToImagePipeline.from_pretrained("alibaba-pai/pai-diffusion-food-large-zh")
- generator.to("cuda")
- image = generator("小炒黄牛肉").images[0]
- image.save("food.png")
- ```
-
- The super-resolution module is disabled by default; to enable it, pass the argument `use_sr=True`.
+ model_id = "alibaba-pai/pai-diffusion-food-large-zh"
+ pipe = StableDiffusionPipeline.from_pretrained(model_id)
+ pipe = pipe.to("cuda")
 
- ```python
- image = generator("小炒黄牛肉", use_sr=True).images[0]
+ prompt = "番茄炒蛋"
+ image = pipe(prompt).images[0]
+ image.save("result.png")
  ```
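The new README snippet above is plain `diffusers` usage. For reference, here is a minimal sketch of the same flow with two common optional settings (half precision and an explicit step count are illustrative choices, not part of the README), assuming `diffusers >= 0.15` and a CUDA device:

```python
# Minimal sketch of the updated usage; torch.float16 and num_inference_steps
# are optional extras added here for illustration, not taken from the README.
import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "alibaba-pai/pai-diffusion-food-large-zh",
    torch_dtype=torch.float16,  # drop this (and .to("cuda")) when running on CPU
)
pipe = pipe.to("cuda")

image = pipe("番茄炒蛋", num_inference_steps=25).images[0]
image.save("result.png")
```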
bert/config.json DELETED
@@ -1,3 +0,0 @@
- {
-   "_name_or_path": "./WukongClipTextEncoder"
- }
feature_extractor/preprocessor_config.json ADDED
@@ -0,0 +1,28 @@
+ {
+   "crop_size": {
+     "height": 224,
+     "width": 224
+   },
+   "do_center_crop": true,
+   "do_convert_rgb": true,
+   "do_normalize": true,
+   "do_rescale": true,
+   "do_resize": true,
+   "feature_extractor_type": "CLIPFeatureExtractor",
+   "image_mean": [
+     0.48145466,
+     0.4578275,
+     0.40821073
+   ],
+   "image_processor_type": "CLIPImageProcessor",
+   "image_std": [
+     0.26862954,
+     0.26130258,
+     0.27577711
+   ],
+   "resample": 3,
+   "rescale_factor": 0.00392156862745098,
+   "size": {
+     "shortest_edge": 224
+   }
+ }
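The added preprocessor config is a stock CLIP image processor (resize and center-crop to 224, CLIP mean/std); the pipeline uses it to prepare generated images for the safety checker. A hedged sketch of loading it on its own (the blank test image is just a placeholder):

```python
# Sketch: load the image processor declared above and run a placeholder image through it.
from PIL import Image
from transformers import CLIPImageProcessor

processor = CLIPImageProcessor.from_pretrained(
    "alibaba-pai/pai-diffusion-food-large-zh", subfolder="feature_extractor"
)

dummy = Image.new("RGB", (512, 512))  # placeholder image, not from the repo
inputs = processor(images=dummy, return_tensors="pt")
print(inputs["pixel_values"].shape)   # torch.Size([1, 3, 224, 224])
```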
model_index.json CHANGED
@@ -1,13 +1,22 @@
  {
-   "_class_name": "LDMTextToImagePipeline",
-   "_diffusers_version": "0.0.4",
-   "bert": [
-     "LdmZhPipeline",
-     "WukongClipTextEncoder"
+   "_class_name": "StableDiffusionPipeline",
+   "_diffusers_version": "0.15.0.dev0",
+   "feature_extractor": [
+     "transformers",
+     "CLIPImageProcessor"
+   ],
+   "requires_safety_checker": true,
+   "safety_checker": [
+     "stable_diffusion",
+     "StableDiffusionSafetyChecker"
    ],
    "scheduler": [
      "diffusers",
-     "DDIMScheduler"
+     "PNDMScheduler"
+   ],
+   "text_encoder": [
+     "transformers",
+     "CLIPTextModel"
    ],
    "tokenizer": [
      "transformers",
@@ -17,12 +26,8 @@
      "diffusers",
      "UNet2DConditionModel"
    ],
-   "vqvae": [
+   "vae": [
      "diffusers",
      "AutoencoderKL"
-   ],
-   "sr": [
-     "LdmZhPipeline",
-     "ESRGAN"
    ]
  }
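`model_index.json` is the file `diffusers` reads to decide which class to instantiate for each subfolder, so after this change the repo loads as a standard `StableDiffusionPipeline` instead of the old custom `LdmZhPipeline` layout. A small sketch of that lookup:

```python
# Sketch: DiffusionPipeline reads model_index.json and builds the listed components.
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained("alibaba-pai/pai-diffusion-food-large-zh")
print(type(pipe).__name__)      # StableDiffusionPipeline, taken from "_class_name"
print(sorted(pipe.components))  # feature_extractor, safety_checker, scheduler, text_encoder, ...
```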
safety_checker/config.json ADDED
@@ -0,0 +1,181 @@
+ {
+   "_commit_hash": null,
+   "_name_or_path": "models/sdm1.4_with_ChTextEncoder/safety_checker",
+   "architectures": [
+     "StableDiffusionSafetyChecker"
+   ],
+   "initializer_factor": 1.0,
+   "logit_scale_init_value": 2.6592,
+   "model_type": "clip",
+   "projection_dim": 768,
+   "text_config": {
+     "_name_or_path": "",
+     "add_cross_attention": false,
+     "architectures": null,
+     "attention_dropout": 0.0,
+     "bad_words_ids": null,
+     "begin_suppress_tokens": null,
+     "bos_token_id": 0,
+     "chunk_size_feed_forward": 0,
+     "cross_attention_hidden_size": null,
+     "decoder_start_token_id": null,
+     "diversity_penalty": 0.0,
+     "do_sample": false,
+     "dropout": 0.0,
+     "early_stopping": false,
+     "encoder_no_repeat_ngram_size": 0,
+     "eos_token_id": 2,
+     "exponential_decay_length_penalty": null,
+     "finetuning_task": null,
+     "forced_bos_token_id": null,
+     "forced_eos_token_id": null,
+     "hidden_act": "quick_gelu",
+     "hidden_size": 768,
+     "id2label": {
+       "0": "LABEL_0",
+       "1": "LABEL_1"
+     },
+     "initializer_factor": 1.0,
+     "initializer_range": 0.02,
+     "intermediate_size": 3072,
+     "is_decoder": false,
+     "is_encoder_decoder": false,
+     "label2id": {
+       "LABEL_0": 0,
+       "LABEL_1": 1
+     },
+     "layer_norm_eps": 1e-05,
+     "length_penalty": 1.0,
+     "max_length": 20,
+     "max_position_embeddings": 77,
+     "min_length": 0,
+     "model_type": "clip_text_model",
+     "no_repeat_ngram_size": 0,
+     "num_attention_heads": 12,
+     "num_beam_groups": 1,
+     "num_beams": 1,
+     "num_hidden_layers": 12,
+     "num_return_sequences": 1,
+     "output_attentions": false,
+     "output_hidden_states": false,
+     "output_scores": false,
+     "pad_token_id": 1,
+     "prefix": null,
+     "problem_type": null,
+     "projection_dim": 512,
+     "pruned_heads": {},
+     "remove_invalid_values": false,
+     "repetition_penalty": 1.0,
+     "return_dict": true,
+     "return_dict_in_generate": false,
+     "sep_token_id": null,
+     "suppress_tokens": null,
+     "task_specific_params": null,
+     "temperature": 1.0,
+     "tf_legacy_loss": false,
+     "tie_encoder_decoder": false,
+     "tie_word_embeddings": true,
+     "tokenizer_class": null,
+     "top_k": 50,
+     "top_p": 1.0,
+     "torch_dtype": null,
+     "torchscript": false,
+     "transformers_version": "4.25.1",
+     "typical_p": 1.0,
+     "use_bfloat16": false,
+     "vocab_size": 49408
+   },
+   "text_config_dict": {
+     "hidden_size": 768,
+     "intermediate_size": 3072,
+     "num_attention_heads": 12,
+     "num_hidden_layers": 12
+   },
+   "torch_dtype": "float32",
+   "transformers_version": null,
+   "vision_config": {
+     "_name_or_path": "",
+     "add_cross_attention": false,
+     "architectures": null,
+     "attention_dropout": 0.0,
+     "bad_words_ids": null,
+     "begin_suppress_tokens": null,
+     "bos_token_id": null,
+     "chunk_size_feed_forward": 0,
+     "cross_attention_hidden_size": null,
+     "decoder_start_token_id": null,
+     "diversity_penalty": 0.0,
+     "do_sample": false,
+     "dropout": 0.0,
+     "early_stopping": false,
+     "encoder_no_repeat_ngram_size": 0,
+     "eos_token_id": null,
+     "exponential_decay_length_penalty": null,
+     "finetuning_task": null,
+     "forced_bos_token_id": null,
+     "forced_eos_token_id": null,
+     "hidden_act": "quick_gelu",
+     "hidden_size": 1024,
+     "id2label": {
+       "0": "LABEL_0",
+       "1": "LABEL_1"
+     },
+     "image_size": 224,
+     "initializer_factor": 1.0,
+     "initializer_range": 0.02,
+     "intermediate_size": 4096,
+     "is_decoder": false,
+     "is_encoder_decoder": false,
+     "label2id": {
+       "LABEL_0": 0,
+       "LABEL_1": 1
+     },
+     "layer_norm_eps": 1e-05,
+     "length_penalty": 1.0,
+     "max_length": 20,
+     "min_length": 0,
+     "model_type": "clip_vision_model",
+     "no_repeat_ngram_size": 0,
+     "num_attention_heads": 16,
+     "num_beam_groups": 1,
+     "num_beams": 1,
+     "num_channels": 3,
+     "num_hidden_layers": 24,
+     "num_return_sequences": 1,
+     "output_attentions": false,
+     "output_hidden_states": false,
+     "output_scores": false,
+     "pad_token_id": null,
+     "patch_size": 14,
+     "prefix": null,
+     "problem_type": null,
+     "projection_dim": 512,
+     "pruned_heads": {},
+     "remove_invalid_values": false,
+     "repetition_penalty": 1.0,
+     "return_dict": true,
+     "return_dict_in_generate": false,
+     "sep_token_id": null,
+     "suppress_tokens": null,
+     "task_specific_params": null,
+     "temperature": 1.0,
+     "tf_legacy_loss": false,
+     "tie_encoder_decoder": false,
+     "tie_word_embeddings": true,
+     "tokenizer_class": null,
+     "top_k": 50,
+     "top_p": 1.0,
+     "torch_dtype": null,
+     "torchscript": false,
+     "transformers_version": "4.25.1",
+     "typical_p": 1.0,
+     "use_bfloat16": false
+   },
+   "vision_config_dict": {
+     "hidden_size": 1024,
+     "intermediate_size": 4096,
+     "num_attention_heads": 16,
+     "num_hidden_layers": 24,
+     "patch_size": 14
+   }
+ }
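The added `safety_checker` is the standard Stable Diffusion NSFW filter, a CLIP-based classifier applied to generated images. Disabling it at load time is a generic `diffusers` option rather than anything specific to this commit; a sketch:

```python
# Sketch: load the pipeline without the newly added safety checker (standard diffusers option).
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "alibaba-pai/pai-diffusion-food-large-zh",
    safety_checker=None,
    requires_safety_checker=False,
)
```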
sr/diffusion_pytorch_model.bin → safety_checker/pytorch_model.bin RENAMED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:6e3e8b6f99d7377a864d9db9bf09d4d345db74dca801db1f4004757ed2ab7746
- size 67028637
+ oid sha256:193490b58ef62739077262e833bf091c66c29488058681ac25cf7df3d8190974
+ size 1216061799
scheduler/scheduler_config.json CHANGED
@@ -1,11 +1,21 @@
  {
-   "_class_name": "DDIMScheduler",
-   "_diffusers_version": "0.0.4",
+   "_class_name": "DPMSolverMultistepScheduler",
+   "_diffusers_version": "0.15.0.dev0",
+   "algorithm_type": "dpmsolver++",
    "beta_end": 0.012,
    "beta_schedule": "scaled_linear",
    "beta_start": 0.00085,
    "clip_sample": false,
+   "dynamic_thresholding_ratio": 0.995,
+   "lower_order_final": true,
    "num_train_timesteps": 1000,
-   "timestep_values": null,
+   "prediction_type": "epsilon",
+   "sample_max_value": 1.0,
+   "set_alpha_to_one": false,
+   "skip_prk_steps": true,
+   "solver_order": 2,
+   "solver_type": "midpoint",
+   "steps_offset": 1,
+   "thresholding": false,
    "trained_betas": null
  }
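The default scheduler config now describes a second-order DPM-Solver++ sampler instead of DDIM. If another sampler is preferred, it can be rebuilt from the same shipped config; the scheduler class below is only an example:

```python
# Sketch: reuse the shipped scheduler config but swap in a different sampler class.
from diffusers import EulerAncestralDiscreteScheduler, StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained("alibaba-pai/pai-diffusion-food-large-zh")
print(type(pipe.scheduler).__name__)  # built from scheduler/scheduler_config.json
pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config)
```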
sr/config.json DELETED
@@ -1,3 +0,0 @@
- {
-   "_name_or_path": "./ESRGAN"
- }
text_encoder/config.json ADDED
@@ -0,0 +1,25 @@
+ {
+   "_name_or_path": "models/sdm1.4_with_ChTextEncoder/text_encoder",
+   "architectures": [
+     "CLIPTextModel"
+   ],
+   "attention_dropout": 0.0,
+   "bos_token_id": 0,
+   "dropout": 0.0,
+   "eos_token_id": 2,
+   "hidden_act": "quick_gelu",
+   "hidden_size": 768,
+   "initializer_factor": 1.0,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "layer_norm_eps": 1e-05,
+   "max_position_embeddings": 32,
+   "model_type": "clip_text_model",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 1,
+   "projection_dim": 512,
+   "torch_dtype": "float32",
+   "transformers_version": "4.25.1",
+   "vocab_size": 21128
+ }
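The text encoder keeps the `CLIPTextModel` architecture but with a 21128-entry vocabulary (a Chinese BERT-style vocab) and `max_position_embeddings: 32`, so prompt conditioning is capped at 32 tokens. A sketch that only inspects the encoder (the random token ids are placeholders):

```python
# Sketch: the text encoder is a CLIPTextModel with a Chinese vocabulary and 32 positions.
import torch
from transformers import CLIPTextModel

text_encoder = CLIPTextModel.from_pretrained(
    "alibaba-pai/pai-diffusion-food-large-zh", subfolder="text_encoder"
)
print(text_encoder.config.vocab_size, text_encoder.config.max_position_embeddings)  # 21128 32

dummy_ids = torch.randint(0, text_encoder.config.vocab_size, (1, 32))  # placeholder token ids
with torch.no_grad():
    hidden = text_encoder(dummy_ids).last_hidden_state
print(hidden.shape)  # torch.Size([1, 32, 768]); this is the UNet's cross-attention context
```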
bert/diffusion_pytorch_model.bin → text_encoder/pytorch_model.bin RENAMED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:c695ff77b6e92e4460596dd52dac62e20e0a0ccfbd92904c7f93054c4bb0a6b0
- size 407646444
+ oid sha256:fb972018408b555a3fe5cfa52ee166e5a3b769cee81a50441d5f10bdc446defe
+ size 405281495
tokenizer/tokenizer_config.json CHANGED
@@ -3,8 +3,8 @@
    "do_basic_tokenize": true,
    "do_lower_case": true,
    "mask_token": "[MASK]",
-   "model_max_length": 512,
-   "name_or_path": "fusing/latent-diffusion-text2im-large",
+   "model_max_length": 32,
+   "name_or_path": "models/sdm1.4_with_ChTextEncoder/tokenizer",
    "never_split": null,
    "pad_token": "[PAD]",
    "sep_token": "[SEP]",
unet/config.json CHANGED
@@ -1,6 +1,7 @@
  {
    "_class_name": "UNet2DConditionModel",
-   "_diffusers_version": "0.0.4",
+   "_diffusers_version": "0.15.0.dev0",
+   "_name_or_path": "models/20230303_food/checkpoint-35000/unet",
    "act_fn": "silu",
    "attention_head_dim": 8,
    "block_out_channels": [
@@ -10,6 +11,10 @@
      1280
    ],
    "center_input_sample": false,
+   "class_embed_type": null,
+   "conv_in_kernel": 3,
+   "conv_out_kernel": 3,
+   "cross_attention_dim": 768,
    "down_block_types": [
      "CrossAttnDownBlock2D",
      "CrossAttnDownBlock2D",
@@ -17,20 +22,30 @@
      "DownBlock2D"
    ],
    "downsample_padding": 1,
+   "dual_cross_attention": false,
    "flip_sin_to_cos": true,
    "freq_shift": 0,
    "in_channels": 4,
    "layers_per_block": 2,
    "mid_block_scale_factor": 1,
+   "mid_block_type": "UNetMidBlock2DCrossAttn",
    "norm_eps": 1e-05,
    "norm_num_groups": 32,
+   "num_class_embeds": null,
+   "only_cross_attention": false,
    "out_channels": 4,
-   "sample_size": 32,
+   "projection_class_embeddings_input_dim": null,
+   "resnet_time_scale_shift": "default",
+   "sample_size": 64,
+   "time_cond_proj_dim": null,
+   "time_embedding_type": "positional",
+   "timestep_post_act": null,
    "up_block_types": [
      "UpBlock2D",
      "CrossAttnUpBlock2D",
      "CrossAttnUpBlock2D",
      "CrossAttnUpBlock2D"
    ],
-   "cross_attention_dim": 768
+   "upcast_attention": false,
+   "use_linear_projection": false
  }
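`sample_size` rises from 32 to 64 latent pixels; with the VAE's 8x spatial downsampling that corresponds to a 512x512 default output instead of 256x256. A sketch that simply spells the default resolution out (the output filename is hypothetical):

```python
# Sketch: sample_size 64 latents * VAE factor 8 = 512 px default images.
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained("alibaba-pai/pai-diffusion-food-large-zh").to("cuda")
image = pipe("番茄炒蛋", height=512, width=512).images[0]  # same as the new defaults
image.save("result_512.png")
```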
unet/diffusion_pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:f67fe536aeeecb1575f1bfbce3c3b1e2c9f9c43099520e01cfa85416affede4d
- size 3438316709
+ oid sha256:bc7325ad594d2f48a26cab4d93167672c6455a1581e9eabab0033fe74cdfe205
+ size 3438354725
{vqvae → vae}/config.json RENAMED
@@ -1,6 +1,7 @@
  {
    "_class_name": "AutoencoderKL",
-   "_diffusers_version": "0.1.2",
+   "_diffusers_version": "0.15.0.dev0",
+   "_name_or_path": "models/sdm1.4_with_ChTextEncoder/vae",
    "act_fn": "silu",
    "block_out_channels": [
      128,
@@ -17,8 +18,10 @@
    "in_channels": 3,
    "latent_channels": 4,
    "layers_per_block": 2,
+   "norm_num_groups": 32,
    "out_channels": 3,
-   "sample_size": 256,
+   "sample_size": 512,
+   "scaling_factor": 0.18215,
    "up_block_types": [
      "UpDecoderBlock2D",
      "UpDecoderBlock2D",
{vqvae → vae}/diffusion_pytorch_model.bin RENAMED
File without changes