Artiprocher committed on
Commit
02640dd
1 Parent(s): 1f5210d

update model

README.md CHANGED
@@ -8,40 +8,18 @@ tags:
 
  # Chinese Latent Diffusion Model
 
- We have open-sourced a Chinese Latent Diffusion model that generates beautiful images of food.
+ We have open-sourced a Chinese Latent Diffusion model (food domain).
 
  * Github: [EasyNLP](https://github.com/alibaba/EasyNLP)
 
- ## Model Introduction
-
- The model consists of the following components:
-
- * Text Encoder: converts the Chinese text input into embedding vectors
- * Latent Diffusion Model: denoises randomly generated noise in the latent space, conditioned on the text input
- * Autoencoder: decodes latent-space tensors back into images
- * Super Resolution: upscales the image resolution
-
- We use a Chinese CLIP-ViT-L model as the Text Encoder, the Autoencoder from [latent-diffusion](https://github.com/CompVis/latent-diffusion), and [ESRGAN](https://github.com/xinntao/ESRGAN) as the Super Resolution model. The Latent Diffusion Model was pre-trained on twenty million image-text pairs from the [Noah-Wukong](https://wukong-dataset.github.io/wukong-dataset/) dataset and then fine-tuned on an internal food dataset.
-
- ## Usage
-
- This model is built on Diffusers; please install Diffusers first:
-
- ```
- pip install diffusers
- ```
-
  ```python
- from LdmZhPipeline import LDMZhTextToImagePipeline
+ from diffusers import StableDiffusionPipeline
 
- generator = LDMZhTextToImagePipeline.from_pretrained("alibaba-pai/pai-diffusion-food-large-zh")
- generator.to("cuda")
- image = generator("小炒黄牛肉").images[0]
- image.save("food.png")
- ```
-
- The super-resolution module is disabled by default; to enable it, pass the argument `use_sr=True`.
+ model_id = "alibaba-pai/pai-diffusion-food-large-zh"
+ pipe = StableDiffusionPipeline.from_pretrained(model_id)
+ pipe = pipe.to("cuda")
 
- ```python
- image = generator("小炒黄牛肉", use_sr=True).images[0]
+ prompt = "番茄炒蛋"
+ image = pipe(prompt).images[0]
+ image.save("result.png")
  ```
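The new README snippet above is plain `diffusers` usage. For reference, here is a minimal sketch of the same flow with two common optional settings (half precision and an explicit step count are illustrative choices, not part of the README), assuming `diffusers >= 0.15` and a CUDA device:

```python
# Minimal sketch of the updated usage; torch.float16 and num_inference_steps
# are optional extras added here for illustration, not taken from the README.
import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "alibaba-pai/pai-diffusion-food-large-zh",
    torch_dtype=torch.float16,  # drop this (and .to("cuda")) when running on CPU
)
pipe = pipe.to("cuda")

image = pipe("番茄炒蛋", num_inference_steps=25).images[0]
image.save("result.png")
```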
bert/config.json DELETED
@@ -1,3 +0,0 @@
- {
-   "_name_or_path": "./WukongClipTextEncoder"
- }
feature_extractor/preprocessor_config.json ADDED
@@ -0,0 +1,28 @@
+ {
+   "crop_size": {
+     "height": 224,
+     "width": 224
+   },
+   "do_center_crop": true,
+   "do_convert_rgb": true,
+   "do_normalize": true,
+   "do_rescale": true,
+   "do_resize": true,
+   "feature_extractor_type": "CLIPFeatureExtractor",
+   "image_mean": [
+     0.48145466,
+     0.4578275,
+     0.40821073
+   ],
+   "image_processor_type": "CLIPImageProcessor",
+   "image_std": [
+     0.26862954,
+     0.26130258,
+     0.27577711
+   ],
+   "resample": 3,
+   "rescale_factor": 0.00392156862745098,
+   "size": {
+     "shortest_edge": 224
+   }
+ }
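The added preprocessor config is a stock CLIP image processor (resize and center-crop to 224, CLIP mean/std); the pipeline uses it to prepare generated images for the safety checker. A hedged sketch of loading it on its own (the blank test image is just a placeholder):

```python
# Sketch: load the image processor declared above and run a placeholder image through it.
from PIL import Image
from transformers import CLIPImageProcessor

processor = CLIPImageProcessor.from_pretrained(
    "alibaba-pai/pai-diffusion-food-large-zh", subfolder="feature_extractor"
)

dummy = Image.new("RGB", (512, 512))  # placeholder image, not from the repo
inputs = processor(images=dummy, return_tensors="pt")
print(inputs["pixel_values"].shape)   # torch.Size([1, 3, 224, 224])
```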
model_index.json CHANGED
@@ -1,13 +1,22 @@
  {
-   "_class_name": "LDMTextToImagePipeline",
-   "_diffusers_version": "0.0.4",
-   "bert": [
-     "LdmZhPipeline",
-     "WukongClipTextEncoder"
+   "_class_name": "StableDiffusionPipeline",
+   "_diffusers_version": "0.15.0.dev0",
+   "feature_extractor": [
+     "transformers",
+     "CLIPImageProcessor"
+   ],
+   "requires_safety_checker": true,
+   "safety_checker": [
+     "stable_diffusion",
+     "StableDiffusionSafetyChecker"
    ],
    "scheduler": [
      "diffusers",
-     "DDIMScheduler"
+     "PNDMScheduler"
+   ],
+   "text_encoder": [
+     "transformers",
+     "CLIPTextModel"
    ],
    "tokenizer": [
      "transformers",
@@ -17,12 +26,8 @@
      "diffusers",
      "UNet2DConditionModel"
    ],
-   "vqvae": [
+   "vae": [
      "diffusers",
      "AutoencoderKL"
-   ],
-   "sr": [
-     "LdmZhPipeline",
-     "ESRGAN"
    ]
  }
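`model_index.json` is the file `diffusers` reads to decide which class to instantiate for each subfolder, so after this change the repo loads as a standard `StableDiffusionPipeline` instead of the old custom `LdmZhPipeline` layout. A small sketch of that lookup:

```python
# Sketch: DiffusionPipeline reads model_index.json and builds the listed components.
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained("alibaba-pai/pai-diffusion-food-large-zh")
print(type(pipe).__name__)      # StableDiffusionPipeline, taken from "_class_name"
print(sorted(pipe.components))  # feature_extractor, safety_checker, scheduler, text_encoder, ...
```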
safety_checker/config.json ADDED
@@ -0,0 +1,181 @@
+ {
+   "_commit_hash": null,
+   "_name_or_path": "models/sdm1.4_with_ChTextEncoder/safety_checker",
+   "architectures": [
+     "StableDiffusionSafetyChecker"
+   ],
+   "initializer_factor": 1.0,
+   "logit_scale_init_value": 2.6592,
+   "model_type": "clip",
+   "projection_dim": 768,
+   "text_config": {
+     "_name_or_path": "",
+     "add_cross_attention": false,
+     "architectures": null,
+     "attention_dropout": 0.0,
+     "bad_words_ids": null,
+     "begin_suppress_tokens": null,
+     "bos_token_id": 0,
+     "chunk_size_feed_forward": 0,
+     "cross_attention_hidden_size": null,
+     "decoder_start_token_id": null,
+     "diversity_penalty": 0.0,
+     "do_sample": false,
+     "dropout": 0.0,
+     "early_stopping": false,
+     "encoder_no_repeat_ngram_size": 0,
+     "eos_token_id": 2,
+     "exponential_decay_length_penalty": null,
+     "finetuning_task": null,
+     "forced_bos_token_id": null,
+     "forced_eos_token_id": null,
+     "hidden_act": "quick_gelu",
+     "hidden_size": 768,
+     "id2label": {
+       "0": "LABEL_0",
+       "1": "LABEL_1"
+     },
+     "initializer_factor": 1.0,
+     "initializer_range": 0.02,
+     "intermediate_size": 3072,
+     "is_decoder": false,
+     "is_encoder_decoder": false,
+     "label2id": {
+       "LABEL_0": 0,
+       "LABEL_1": 1
+     },
+     "layer_norm_eps": 1e-05,
+     "length_penalty": 1.0,
+     "max_length": 20,
+     "max_position_embeddings": 77,
+     "min_length": 0,
+     "model_type": "clip_text_model",
+     "no_repeat_ngram_size": 0,
+     "num_attention_heads": 12,
+     "num_beam_groups": 1,
+     "num_beams": 1,
+     "num_hidden_layers": 12,
+     "num_return_sequences": 1,
+     "output_attentions": false,
+     "output_hidden_states": false,
+     "output_scores": false,
+     "pad_token_id": 1,
+     "prefix": null,
+     "problem_type": null,
+     "projection_dim": 512,
+     "pruned_heads": {},
+     "remove_invalid_values": false,
+     "repetition_penalty": 1.0,
+     "return_dict": true,
+     "return_dict_in_generate": false,
+     "sep_token_id": null,
+     "suppress_tokens": null,
+     "task_specific_params": null,
+     "temperature": 1.0,
+     "tf_legacy_loss": false,
+     "tie_encoder_decoder": false,
+     "tie_word_embeddings": true,
+     "tokenizer_class": null,
+     "top_k": 50,
+     "top_p": 1.0,
+     "torch_dtype": null,
+     "torchscript": false,
+     "transformers_version": "4.25.1",
+     "typical_p": 1.0,
+     "use_bfloat16": false,
+     "vocab_size": 49408
+   },
+   "text_config_dict": {
+     "hidden_size": 768,
+     "intermediate_size": 3072,
+     "num_attention_heads": 12,
+     "num_hidden_layers": 12
+   },
+   "torch_dtype": "float32",
+   "transformers_version": null,
+   "vision_config": {
+     "_name_or_path": "",
+     "add_cross_attention": false,
+     "architectures": null,
+     "attention_dropout": 0.0,
+     "bad_words_ids": null,
+     "begin_suppress_tokens": null,
+     "bos_token_id": null,
+     "chunk_size_feed_forward": 0,
+     "cross_attention_hidden_size": null,
+     "decoder_start_token_id": null,
+     "diversity_penalty": 0.0,
+     "do_sample": false,
+     "dropout": 0.0,
+     "early_stopping": false,
+     "encoder_no_repeat_ngram_size": 0,
+     "eos_token_id": null,
+     "exponential_decay_length_penalty": null,
+     "finetuning_task": null,
+     "forced_bos_token_id": null,
+     "forced_eos_token_id": null,
+     "hidden_act": "quick_gelu",
+     "hidden_size": 1024,
+     "id2label": {
+       "0": "LABEL_0",
+       "1": "LABEL_1"
+     },
+     "image_size": 224,
+     "initializer_factor": 1.0,
+     "initializer_range": 0.02,
+     "intermediate_size": 4096,
+     "is_decoder": false,
+     "is_encoder_decoder": false,
+     "label2id": {
+       "LABEL_0": 0,
+       "LABEL_1": 1
+     },
+     "layer_norm_eps": 1e-05,
+     "length_penalty": 1.0,
+     "max_length": 20,
+     "min_length": 0,
+     "model_type": "clip_vision_model",
+     "no_repeat_ngram_size": 0,
+     "num_attention_heads": 16,
+     "num_beam_groups": 1,
+     "num_beams": 1,
+     "num_channels": 3,
+     "num_hidden_layers": 24,
+     "num_return_sequences": 1,
+     "output_attentions": false,
+     "output_hidden_states": false,
+     "output_scores": false,
+     "pad_token_id": null,
+     "patch_size": 14,
+     "prefix": null,
+     "problem_type": null,
+     "projection_dim": 512,
+     "pruned_heads": {},
+     "remove_invalid_values": false,
+     "repetition_penalty": 1.0,
+     "return_dict": true,
+     "return_dict_in_generate": false,
+     "sep_token_id": null,
+     "suppress_tokens": null,
+     "task_specific_params": null,
+     "temperature": 1.0,
+     "tf_legacy_loss": false,
+     "tie_encoder_decoder": false,
+     "tie_word_embeddings": true,
+     "tokenizer_class": null,
+     "top_k": 50,
+     "top_p": 1.0,
+     "torch_dtype": null,
+     "torchscript": false,
+     "transformers_version": "4.25.1",
+     "typical_p": 1.0,
+     "use_bfloat16": false
+   },
+   "vision_config_dict": {
+     "hidden_size": 1024,
+     "intermediate_size": 4096,
+     "num_attention_heads": 16,
+     "num_hidden_layers": 24,
+     "patch_size": 14
+   }
+ }
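The added `safety_checker` is the standard Stable Diffusion NSFW filter, a CLIP-based classifier applied to generated images. Disabling it at load time is a generic `diffusers` option rather than anything specific to this commit; a sketch:

```python
# Sketch: load the pipeline without the newly added safety checker (standard diffusers option).
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "alibaba-pai/pai-diffusion-food-large-zh",
    safety_checker=None,
    requires_safety_checker=False,
)
```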
sr/diffusion_pytorch_model.bin → safety_checker/pytorch_model.bin RENAMED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:6e3e8b6f99d7377a864d9db9bf09d4d345db74dca801db1f4004757ed2ab7746
- size 67028637
+ oid sha256:193490b58ef62739077262e833bf091c66c29488058681ac25cf7df3d8190974
+ size 1216061799
scheduler/scheduler_config.json CHANGED
@@ -1,11 +1,21 @@
  {
-   "_class_name": "DDIMScheduler",
-   "_diffusers_version": "0.0.4",
+   "_class_name": "DPMSolverMultistepScheduler",
+   "_diffusers_version": "0.15.0.dev0",
+   "algorithm_type": "dpmsolver++",
    "beta_end": 0.012,
    "beta_schedule": "scaled_linear",
    "beta_start": 0.00085,
    "clip_sample": false,
+   "dynamic_thresholding_ratio": 0.995,
+   "lower_order_final": true,
    "num_train_timesteps": 1000,
-   "timestep_values": null,
+   "prediction_type": "epsilon",
+   "sample_max_value": 1.0,
+   "set_alpha_to_one": false,
+   "skip_prk_steps": true,
+   "solver_order": 2,
+   "solver_type": "midpoint",
+   "steps_offset": 1,
+   "thresholding": false,
    "trained_betas": null
  }
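The default scheduler config now describes a second-order DPM-Solver++ sampler instead of DDIM. If another sampler is preferred, it can be rebuilt from the same shipped config; the scheduler class below is only an example:

```python
# Sketch: reuse the shipped scheduler config but swap in a different sampler class.
from diffusers import EulerAncestralDiscreteScheduler, StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained("alibaba-pai/pai-diffusion-food-large-zh")
print(type(pipe.scheduler).__name__)  # built from scheduler/scheduler_config.json
pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config)
```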
sr/config.json DELETED
@@ -1,3 +0,0 @@
- {
-   "_name_or_path": "./ESRGAN"
- }
text_encoder/config.json ADDED
@@ -0,0 +1,25 @@
+ {
+   "_name_or_path": "models/sdm1.4_with_ChTextEncoder/text_encoder",
+   "architectures": [
+     "CLIPTextModel"
+   ],
+   "attention_dropout": 0.0,
+   "bos_token_id": 0,
+   "dropout": 0.0,
+   "eos_token_id": 2,
+   "hidden_act": "quick_gelu",
+   "hidden_size": 768,
+   "initializer_factor": 1.0,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "layer_norm_eps": 1e-05,
+   "max_position_embeddings": 32,
+   "model_type": "clip_text_model",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 1,
+   "projection_dim": 512,
+   "torch_dtype": "float32",
+   "transformers_version": "4.25.1",
+   "vocab_size": 21128
+ }
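The text encoder keeps the `CLIPTextModel` architecture but with a 21128-entry vocabulary (a Chinese BERT-style vocab) and `max_position_embeddings: 32`, so prompt conditioning is capped at 32 tokens. A sketch that only inspects the encoder (the random token ids are placeholders):

```python
# Sketch: the text encoder is a CLIPTextModel with a Chinese vocabulary and 32 positions.
import torch
from transformers import CLIPTextModel

text_encoder = CLIPTextModel.from_pretrained(
    "alibaba-pai/pai-diffusion-food-large-zh", subfolder="text_encoder"
)
print(text_encoder.config.vocab_size, text_encoder.config.max_position_embeddings)  # 21128 32

dummy_ids = torch.randint(0, text_encoder.config.vocab_size, (1, 32))  # placeholder token ids
with torch.no_grad():
    hidden = text_encoder(dummy_ids).last_hidden_state
print(hidden.shape)  # torch.Size([1, 32, 768]); this is the UNet's cross-attention context
```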
bert/diffusion_pytorch_model.bin → text_encoder/pytorch_model.bin RENAMED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:c695ff77b6e92e4460596dd52dac62e20e0a0ccfbd92904c7f93054c4bb0a6b0
- size 407646444
+ oid sha256:fb972018408b555a3fe5cfa52ee166e5a3b769cee81a50441d5f10bdc446defe
+ size 405281495
tokenizer/tokenizer_config.json CHANGED
@@ -3,8 +3,8 @@
    "do_basic_tokenize": true,
    "do_lower_case": true,
    "mask_token": "[MASK]",
-   "model_max_length": 512,
-   "name_or_path": "fusing/latent-diffusion-text2im-large",
+   "model_max_length": 32,
+   "name_or_path": "models/sdm1.4_with_ChTextEncoder/tokenizer",
    "never_split": null,
    "pad_token": "[PAD]",
    "sep_token": "[SEP]",
unet/config.json CHANGED
@@ -1,6 +1,7 @@
  {
    "_class_name": "UNet2DConditionModel",
-   "_diffusers_version": "0.0.4",
+   "_diffusers_version": "0.15.0.dev0",
+   "_name_or_path": "models/20230303_food/checkpoint-35000/unet",
    "act_fn": "silu",
    "attention_head_dim": 8,
    "block_out_channels": [
@@ -10,6 +11,10 @@
      1280
    ],
    "center_input_sample": false,
+   "class_embed_type": null,
+   "conv_in_kernel": 3,
+   "conv_out_kernel": 3,
+   "cross_attention_dim": 768,
    "down_block_types": [
      "CrossAttnDownBlock2D",
      "CrossAttnDownBlock2D",
@@ -17,20 +22,30 @@
      "DownBlock2D"
    ],
    "downsample_padding": 1,
+   "dual_cross_attention": false,
    "flip_sin_to_cos": true,
    "freq_shift": 0,
    "in_channels": 4,
    "layers_per_block": 2,
    "mid_block_scale_factor": 1,
+   "mid_block_type": "UNetMidBlock2DCrossAttn",
    "norm_eps": 1e-05,
    "norm_num_groups": 32,
+   "num_class_embeds": null,
+   "only_cross_attention": false,
    "out_channels": 4,
-   "sample_size": 32,
+   "projection_class_embeddings_input_dim": null,
+   "resnet_time_scale_shift": "default",
+   "sample_size": 64,
+   "time_cond_proj_dim": null,
+   "time_embedding_type": "positional",
+   "timestep_post_act": null,
    "up_block_types": [
      "UpBlock2D",
      "CrossAttnUpBlock2D",
      "CrossAttnUpBlock2D",
      "CrossAttnUpBlock2D"
    ],
-   "cross_attention_dim": 768
+   "upcast_attention": false,
+   "use_linear_projection": false
  }
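`sample_size` rises from 32 to 64 latent pixels; with the VAE's 8x spatial downsampling that corresponds to a 512x512 default output instead of 256x256. A sketch that simply spells the default resolution out (the output filename is hypothetical):

```python
# Sketch: sample_size 64 latents * VAE factor 8 = 512 px default images.
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained("alibaba-pai/pai-diffusion-food-large-zh").to("cuda")
image = pipe("番茄炒蛋", height=512, width=512).images[0]  # same as the new defaults
image.save("result_512.png")
```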
unet/diffusion_pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:f67fe536aeeecb1575f1bfbce3c3b1e2c9f9c43099520e01cfa85416affede4d
- size 3438316709
+ oid sha256:bc7325ad594d2f48a26cab4d93167672c6455a1581e9eabab0033fe74cdfe205
+ size 3438354725
{vqvae → vae}/config.json RENAMED
@@ -1,6 +1,7 @@
  {
    "_class_name": "AutoencoderKL",
-   "_diffusers_version": "0.1.2",
+   "_diffusers_version": "0.15.0.dev0",
+   "_name_or_path": "models/sdm1.4_with_ChTextEncoder/vae",
    "act_fn": "silu",
    "block_out_channels": [
      128,
@@ -17,8 +18,10 @@
    "in_channels": 3,
    "latent_channels": 4,
    "layers_per_block": 2,
+   "norm_num_groups": 32,
    "out_channels": 3,
-   "sample_size": 256,
+   "sample_size": 512,
+   "scaling_factor": 0.18215,
    "up_block_types": [
      "UpDecoderBlock2D",
      "UpDecoderBlock2D",
{vqvae → vae}/diffusion_pytorch_model.bin RENAMED
File without changes