add model

Browse files

Files changed (14) hide show

README.md +47 -1
bert/config.json +3 -0
bert/diffusion_pytorch_model.bin +3 -0
model_index.json +28 -0
scheduler/scheduler_config.json +11 -0
sr/config.json +3 -0
sr/diffusion_pytorch_model.bin +3 -0
tokenizer/special_tokens_map.json +7 -0
tokenizer/tokenizer_config.json +16 -0
tokenizer/vocab.txt +0 -0
unet/config.json +36 -0
unet/diffusion_pytorch_model.bin +3 -0
vqvae/config.json +28 -0
vqvae/diffusion_pytorch_model.bin +3 -0

README.md CHANGED Viewed

@@ -1,3 +1,49 @@
 ---
-license: mit
 ---

 ---
+license: apache-2.0
+tags:
+- pytorch
+- diffusers
+- text-to-image
 ---
+# Chinese Latent Diffusion Model
+我们开源了一个中文 Latent Diffusion 模型，为中文古诗词生成精美配图
+* Github: [EasyNLP](https://github.com/alibaba/EasyNLP)
+## 模型介绍
+模型分成四部分：
+* Text Encoder：把中文文本输入转化成 Embedding 向量
+* Latent Diffusion Model：在 Latent 空间中根据文本输入处理随机生成的噪声
+* Autoencoder：将 Latent 空间中的张量还原为图片
+* Super Resolution：提升图片分辨率
+我们使用中文模型 [CLIP-ViT-L](https://wukong-dataset.github.io/wukong-dataset/benchmark.html) 作为 Text Encoder，使用 [latent-diffusion](https://github.com/CompVis/latent-diffusion) 中的 Autoencoder，使用 [ESRGAN](https://github.com/xinntao/ESRGAN) 作为 Super Resolution 模型。我们使用 [Noah-Wukong](https://wukong-dataset.github.io/wukong-dataset/) 数据集中的两千万图文对，对 Latent Diffusion Model 进行了预训练。
+我们在私有美食数据集上进行了微调，以生成精美的美食图片。
+## 使用
+基于 Diffusers 开发，请先安装 Diffusers
+```
+pip install diffusers
+```
+```python
+from LdmZhPipeline import LDMZhTextToImagePipeline
+generator = LDMZhTextToImagePipeline.from_pretrained("alibaba-pai/pai-diffusion-food-large-zh")
+generator.to("cuda")
+image = generator("番茄炒蛋").images[0]
+image.save("food.png")
+```
+超分辨率模块默认是关闭的，如需启用，请添加参数 `use_sr=True`。
+```python
+image = generator("番茄炒蛋", use_sr=True).images[0]
+```

bert/config.json ADDED Viewed

	@@ -0,0 +1,3 @@

+{
+  "_name_or_path": "./WukongClipTextEncoder"
+}

bert/diffusion_pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f631d47508ad78826f22d0498b57bf1c7bd6c6530c8ca16496e83acfec780415
+size 407646444

model_index.json ADDED Viewed

	@@ -0,0 +1,28 @@

+{
+  "_class_name": "LDMTextToImagePipeline",
+  "_diffusers_version": "0.0.4",
+  "bert": [
+    "LdmZhPipeline",
+    "WukongClipTextEncoder"
+  ],
+  "scheduler": [
+    "diffusers",
+    "DDIMScheduler"
+  ],
+  "tokenizer": [
+    "transformers",
+    "BertTokenizer"
+  ],
+  "unet": [
+    "diffusers",
+    "UNet2DConditionModel"
+  ],
+  "vqvae": [
+    "diffusers",
+    "AutoencoderKL"
+  ],
+  "sr": [
+    "LdmZhPipeline",
+    "ESRGAN"
+  ]
+}

scheduler/scheduler_config.json ADDED Viewed

	@@ -0,0 +1,11 @@

+{
+  "_class_name": "DDIMScheduler",
+  "_diffusers_version": "0.0.4",
+  "beta_end": 0.012,
+  "beta_schedule": "linear",
+  "beta_start": 0.00085,
+  "clip_sample": false,
+  "num_train_timesteps": 1000,
+  "timestep_values": null,
+  "trained_betas": null
+}

sr/config.json ADDED Viewed

	@@ -0,0 +1,3 @@

+{
+  "_name_or_path": "./ESRGAN"
+}

sr/diffusion_pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6e3e8b6f99d7377a864d9db9bf09d4d345db74dca801db1f4004757ed2ab7746
+size 67028637

tokenizer/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "cls_token": "[CLS]",
+  "mask_token": "[MASK]",
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "unk_token": "[UNK]"
+}

tokenizer/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,16 @@

+{
+  "cls_token": "[CLS]",
+  "do_basic_tokenize": true,
+  "do_lower_case": true,
+  "mask_token": "[MASK]",
+  "model_max_length": 512,
+  "name_or_path": "fusing/latent-diffusion-text2im-large",
+  "never_split": null,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "special_tokens_map_file": null,
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "BertTokenizer",
+  "unk_token": "[UNK]"
+}

tokenizer/vocab.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

unet/config.json ADDED Viewed

	@@ -0,0 +1,36 @@

+{
+  "_class_name": "UNet2DConditionModel",
+  "_diffusers_version": "0.0.4",
+  "act_fn": "silu",
+  "attention_head_dim": 8,
+  "block_out_channels": [
+    320,
+    640,
+    1280,
+    1280
+  ],
+  "center_input_sample": false,
+  "down_block_types": [
+    "CrossAttnDownBlock2D",
+    "CrossAttnDownBlock2D",
+    "CrossAttnDownBlock2D",
+    "DownBlock2D"
+  ],
+  "downsample_padding": 1,
+  "flip_sin_to_cos": true,
+  "freq_shift": 0,
+  "in_channels": 4,
+  "layers_per_block": 2,
+  "mid_block_scale_factor": 1,
+  "norm_eps": 1e-05,
+  "norm_num_groups": 32,
+  "out_channels": 4,
+  "sample_size": 32,
+  "up_block_types": [
+    "UpBlock2D",
+    "CrossAttnUpBlock2D",
+    "CrossAttnUpBlock2D",
+    "CrossAttnUpBlock2D"
+  ],
+  "cross_attention_dim": 768
+}

unet/diffusion_pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4d5961eef51e952fef92c1e7eeedbfda88dcbf2aa7542db7c45b101fcc97402a
+size 3438322101

vqvae/config.json ADDED Viewed

	@@ -0,0 +1,28 @@

+{
+  "_class_name": "AutoencoderKL",
+  "_diffusers_version": "0.1.2",
+  "act_fn": "silu",
+  "block_out_channels": [
+    128,
+    256,
+    512,
+    512
+  ],
+  "down_block_types": [
+    "DownEncoderBlock2D",
+    "DownEncoderBlock2D",
+    "DownEncoderBlock2D",
+    "DownEncoderBlock2D"
+  ],
+  "in_channels": 3,
+  "latent_channels": 4,
+  "layers_per_block": 2,
+  "out_channels": 3,
+  "sample_size": 256,
+  "up_block_types": [
+    "UpDecoderBlock2D",
+    "UpDecoderBlock2D",
+    "UpDecoderBlock2D",
+    "UpDecoderBlock2D"
+  ]
+}

vqvae/diffusion_pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1b134cded8eb78b184aefb8805b6b572f36fa77b255c483665dda931fa0130c5
+size 334707217